1 //========================================================================
2 //
3 // ABWOutputDev.cc
4 //
5 // Copyright 2006-2007 Jauco Noordzij <jauco@jauco.nl>
6 // Copyright 2007 Dominic Lachowicz <cinamod@hotmail.com>
7 // Copyright 2008 Hib Eris <hib@hiberis.nl>
8 //
9 // Based somewhat on HtmlOutputDev.cc
10 //
11 //========================================================================
12
13 #ifdef __GNUC__
14 #pragma implementation
15 #endif
16
#include "config.h"
#include <ctype.h>
#include <math.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <utility>
#include <vector>

#include "goo/GooString.h"
#include "goo/GooList.h"
#include "UnicodeMap.h"
#include "goo/gmem.h"
#include "Error.h"
#include "GfxState.h"
#include "GlobalParams.h"
#include "ABWOutputDev.h"
#include "PDFDoc.h"

#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
38
39
40 // Inter-character space width which will cause addChar to start a new
41 // word.
42 #define minWordBreakSpace 0.1
43
44 // Maximum inter-word spacing, as a fraction of the font size.
45 #define maxWordSpacing 1.5
46
47 // Max distance between baselines of two lines within a block, as a
48 // fraction of the font size.
49 #define maxLineSpacingDelta 1.5
50
51 #define C_maxVCutValue 4
52 #define C_maxHCutValue 5
53 //------------------------------------------------------------------------
54 // ABWOutputDev
55 //------------------------------------------------------------------------
56
ABWOutputDev(xmlDocPtr ext_doc)57 ABWOutputDev::ABWOutputDev(xmlDocPtr ext_doc)
58 {
59 pdfdoc = NULL;
60 N_page = N_style = N_text = N_styleset = N_Block = N_word = NULL;
61 doc = ext_doc;
62 N_root = xmlNewNode(NULL, BAD_CAST "abiword");
63 xmlDocSetRootElement(doc, N_root);
64 N_styleset = xmlNewChild(N_root, NULL, BAD_CAST "styles", NULL);
65 N_content = xmlNewChild(N_root, NULL, BAD_CAST "content", NULL);
66 uMap = globalParams->getTextEncoding();
67 maxStyle = Style = 1;
68 }
69
~ABWOutputDev()70 ABWOutputDev::~ABWOutputDev() {
71 xmlCleanupParser();
72 }
73
startPage(int pageNum,GfxState * state)74 void ABWOutputDev::startPage(int pageNum, GfxState *state) {
75 /*While reading a pdf page this node acts as a placeholder parent.
76 when conversion is finished and the page is structured as we like it
77 all text fragments are moved from N_page to N_content.*/
78 N_page = xmlNewNode(NULL, BAD_CAST "page");
79 G_pageNum = pageNum;
80 }
81
82 /*Callback to denote that poppler reached the end of a page
83 here I insert most of the interesting processing stuff*/
endPage()84 void ABWOutputDev::endPage() {
85 //make sure all words are closed
86 endTextBlock();
87 cleanUpNode(N_page, true);
88 //xmlAddChild(N_content, N_page);
89 //xmlSaveFormatFileEnc("pre-cut.xml", doc, "UTF-8", 1);
90 //xmlUnlinkNode(N_page);
91 //call the top down cutting mechanism
92 recursiveXYC(N_page);
93 //by stopping to worry about creating empty nodes I made the code quite a
94 //bit more robust. This function makes sure we have a nice'n'clean tree
95 cleanUpNode(N_page, true);
96 //xmlAddChild(N_content, N_page);
97 //xmlSaveFormatFileEnc("raw.xml", doc, "UTF-8", 1);
98 //xmlUnlinkNode(N_page);
99
100 //Interpret the XY tree and infer text blocks and columns
101 interpretXYTree();
102 cleanUpNode(N_page, true);
103 //xmlAddChild(N_content, N_page);
104 //xmlSaveFormatFileEnc("interpreted.xml", doc, "UTF-8", 1);
105 //xmlUnlinkNode(N_page);
106
107 //I have blocks and columns, this function will turn that into paragraphs and
108 //columns
109 generateParagraphs();
110 cleanUpNode(N_page, true);
111 xmlAddChild(N_content, N_page);
112 N_page = NULL;
113 }
114
recursiveXYC(xmlNodePtr nodeset)115 void ABWOutputDev::recursiveXYC(xmlNodePtr nodeset) {
116 /*This function implements the recursive XY Cut. basically, it gets
117 the largest piece of whitespace (using getBiggestSeperator()) and then
118 splits the page using splitNodes on that whitespace. It calls itself again
119 with both the halves*/
120 float bhs, bvs, X1, X2, Y1, Y2;
121
122 bvs = getBiggestSeperator(nodeset, VERTICAL, &X1, &X2);
123 bhs = getBiggestSeperator(nodeset, HORIZONTAL, &Y1, &Y2);
124
125 if (bvs == -1){
126 if (bhs == -1){//both -1
127 //FIXME: add assertions that bvs and bhs are >=-1
128 printf("No seperators\n");
129 return;
130 }
131 else { //only bhs > -1
132 splitNodes(Y1, HORIZONTAL, nodeset, bhs);
133 }
134 }
135 else {
136 if (bhs == -1){//only bvs > -1
137 splitNodes(X1, VERTICAL, nodeset, bvs);
138 }
139 else {//both > -1
140 if (bvs >= (bhs/1.7)){
141 //When people read a text they prefer vertical cuts over horizontal
142 //ones. I'm not that sure about the 1.7 value, but it seems to work.
143 splitNodes(X1, VERTICAL, nodeset, bvs);
144 }
145 else {
146 splitNodes(Y1, HORIZONTAL, nodeset, bhs);
147 }
148 }
149 }
150 recursiveXYC(nodeset->children);
151 recursiveXYC(nodeset->children->next);
152 }
153
splitNodes(float splitValue,unsigned int direction,xmlNodePtr N_parent,double seperator)154 void ABWOutputDev::splitNodes(float splitValue, unsigned int direction, xmlNodePtr N_parent, double seperator){
155 //This function takes a nodeset and splits it based on a cut value. It returns
156 //the nodePtr with two childnodes, the both chunks.
157 xmlNodePtr N_move, N_cur, N_newH, N_newL;
158 char * propName;
159 const char *nodeName;
160 char buf[20];
161 if (direction == HORIZONTAL) {
162 propName = "Y1";
163 nodeName = "horizontal";
164 }
165 else {
166 propName = "X1";
167 nodeName = "vertical";
168 }
169 N_newH = xmlNewNode(NULL, BAD_CAST nodeName);
170 N_newL = xmlNewNode(NULL, BAD_CAST nodeName);
171 sprintf(buf, "%f", seperator);
172 xmlNewProp(N_newH, BAD_CAST "diff", BAD_CAST buf);
173 sprintf(buf, "%f", seperator);
174 xmlNewProp(N_newL, BAD_CAST "diff", BAD_CAST buf);
175 N_cur = N_parent->children;
176 while (N_cur){
177 N_move = N_cur->next;
178 xmlUnlinkNode(N_cur);
179 if (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST propName)) > splitValue){
180 xmlAddChild(N_newH, N_cur);
181 }
182 else {
183 xmlAddChild(N_newL, N_cur);
184 }
185 N_cur = N_move;
186 }
187 xmlAddChild(N_parent, N_newL);
188 xmlAddChild(N_parent, N_newH);
189 }
190
getBiggestSeperator(xmlNodePtr N_set,unsigned int direction,float * C1,float * C2)191 float ABWOutputDev::getBiggestSeperator(xmlNodePtr N_set, unsigned int direction, float * C1, float * C2)
192 {
193 int i = 0;
194 int nodeCount = xmlLsCountNode(N_set);
195 float store;
196 int min;
197 float gap, endV;
198 float * stt;
199 float * end;
200 if (nodeCount == 0){
201 //Add assertion that this shouldn't happen
202 fprintf(stderr,"No child nodes");
203 return -1;
204 }
205 stt = new float[nodeCount];
206 end = new float[nodeCount];
207 //store all variables in two arrays (one for start, one for end coordinates)
208 if (direction == VERTICAL) {
209 for (xmlNodePtr N_cur = N_set->children; N_cur != NULL; N_cur = N_cur->next){
210 stt[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1"));
211 end[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2"));
212 i++;
213 }
214 }
215 else {
216 for (xmlNodePtr N_cur = N_set->children; N_cur != NULL; N_cur = N_cur->next){
217 stt[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1"));
218 end[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2"));
219 i++;
220 }
221 }
222 //Sort them
223 for (i = 0; i < nodeCount - 1; i++){
224 min = i;
225 for (int j = i + 1; j < nodeCount; j++)
226 if (stt[j] < stt[i])
227 min = j;
228 store = stt[i];
229 stt[i] = stt[min];
230 stt[min] = store;
231 store = end[i];
232 end[i] = end[min];
233 end[min] = store;
234 }
235 //find the largest gap
236 gap = -1;
237 endV = end[0];
238 *C1 = 0;
239 *C2 = 0;
240 for (int inspect = 1; inspect < nodeCount; inspect++){
241 //no gap
242 if (((stt[inspect] - endV) - gap) < 0.5){ //FIXME:This is copied almost directly from the previous function, needs checking out
243 //partial overlap instead of complete one
244 if (end[inspect] > endV)
245 endV = end[inspect];
246 }
247 //gap
248 else{
249 //gap is larger than any previous gap
250 if (gap < (stt[inspect] - endV)){
251 gap = stt[inspect] - endV;
252 *C1 = endV;
253 *C2 = stt[inspect];
254 }
255 endV = end[inspect];
256 }
257 }
258 delete[] stt;
259 delete[] end;
260 return gap;
261 }
262
updateFont(GfxState * state)263 void ABWOutputDev::updateFont(GfxState *state) {
264 char buf[160];
265 xmlNodePtr N_cur;
266 GfxFont *font;
267 bool found = false;
268 bool isBold, isItalic, S_isBold, S_isItalic;
269 isBold = isItalic = S_isBold = S_isItalic = false;
270 font = state->getFont();
271 GooString *ftName;
272 char *fnEnd, *fnName;
273 int fnStart, ftSize;
274 //the first time this function is called there is no funt.
275 //Fixme: find out if that isn'y a bug
276 if (font){
277 isBold = (font->isBold() || font->getWeight() >6 || (strstr(font->getOrigName()->getCString(), "Bold")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-4)));
278 isItalic = (font->isItalic() || (strstr(font->getOrigName()->getCString(), "Italic")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-6)));
279 ftSize = int(state->getTransformedFontSize())-1;
280 ftName = new GooString(font->getOrigName());
281 fnStart = strcspn(ftName->getCString(), "+");
282 if (fnStart < ftName->getLength())
283 ftName->del(0,fnStart+1);
284 fnEnd = strrchr(ftName->getCString(), 44);
285 if (fnEnd == 0)
286 fnEnd = strrchr(ftName->getCString(), 45);
287 if (fnEnd != 0)
288 ftName->del(fnEnd-ftName->getCString(),ftName->getLength()-1);
289
290 /* fnName = ftName;
291 if (isBold or isItalic){
292 fnStart = strcspn(fnName, "+");
293 if (fnStart == font->getOrigName()->getLength())
294 fnStart = 0;
295 else fnStart++;
296
297 fnEnd = strstr(fnName, ",");
298 if (fnEnd == 0)
299 fnEnd = strstr(fnName, "-");
300 if (fnEnd != 0)
301 fnName[fnEnd-fnName] = 0;
302 // char fntName[fnLength];
303 // strncpy (fntName,fnName+fnStart+1,fnLength);
304 fnName+=fnStart;
305 // fnName = fntName;
306 }
307 else {*/
308 fnName = ftName->getCString();
309 // }
310 for (N_cur = N_styleset->children; N_cur; N_cur = N_cur ->next){
311 if (
312 isBold == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "bold"),BAD_CAST "bold;") == 0)
313 &&
314 isItalic == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "italic"),BAD_CAST "italic") == 0)
315 &&
316 xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "font"),BAD_CAST fnName) == 0
317 &&
318 xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size")) == ftSize
319 ) {
320 found = true;
321 Style = int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "id")));
322 }
323 }
324 if (!found){
325 N_cur = xmlNewChild(N_styleset, NULL, BAD_CAST "s", NULL);
326 xmlSetProp(N_cur, BAD_CAST "type", BAD_CAST "P");
327 sprintf(buf, "%d", maxStyle++);
328 xmlSetProp(N_cur, BAD_CAST "name", BAD_CAST buf);
329 xmlSetProp(N_cur, BAD_CAST "id", BAD_CAST buf);
330 Style = maxStyle;
331 sprintf(buf, "%d", ftSize); xmlSetProp(N_cur, BAD_CAST "size", BAD_CAST buf);
332 isBold ? xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "bold;") : xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "normal;");
333 isItalic ? xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "italic"): xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "normal");
334 xmlSetProp(N_cur, BAD_CAST "font", BAD_CAST fnName);
335 }
336 }
337 }
338
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode code,int nBytes,Unicode * u,int uLen)339 void ABWOutputDev::drawChar(GfxState *state, double x, double y,
340 double dx, double dy,
341 double originX, double originY,
342 CharCode code, int nBytes, Unicode *u, int uLen)
343 {
344 //I wouldn't know what size this should safely be. I guess 64 bytes should be
345 //enough for any unicode character
346 char buf[64];
347 int charLen;
348 x = dx;
349 y = dy;
350 //state->textTransformDelta(dx * state->getHorizScaling(), dy, &dx, &dy);
351 //state->transformDelta(dx, dy, &dx, &dy);
352 if (uLen == 1 && code == 0x20) {
353 //If we break a text sequence on space, then the X1 should be increased
354 //but the Y1 and Y2 should remain the same.
355 beginWord(state,X2+dx,Y2);
356 }
357 else {
358 X2 += dx;
359 Y2 += dy;
360 charLen = uMap->mapUnicode(*u,buf,sizeof(buf));
361 //Getting Unicode to libxml is something I need to fix.
362 //simply passing it using a bad-cast isn't working.
363 //I assume that CharCode code it the U+value of the unicode character
364 //But for a ligature code gives me DF which is the ringel-s, I guess
365 //code should be two bytes wide?
366 xmlNodeAddContentLen(N_word, BAD_CAST buf, charLen);
367 }
368 }
369
beginString(GfxState * state,GooString * s)370 void ABWOutputDev::beginString(GfxState *state, GooString *s) {
371 double x,y;
372 //state->textTransform(x, y, &x, &y);
373 state->transform(state->getCurX(), state->getCurY(), &x, &y);
374 if (N_word) {
375 verDist = y-Y2;
376 horDist = x-X2;
377 //TEST:changed fabs(horDist) to horDist
378 //FIXME: this if statement seems awkward to me.
379 if (horDist > (state->getTransformedFontSize()*maxWordSpacing) || (fabs(verDist) > (state->getTransformedFontSize()/maxLineSpacingDelta))) {
380 beginTextBlock(state,x,y);
381 }
382 else {
383 if ((horDist > (state->getTransformedFontSize()*minWordBreakSpace)) || (fabs(verDist) > (state->getTransformedFontSize()/maxLineSpacingDelta))) {
384 beginWord(state,x,y);
385 }
386 }
387 }
388 else {
389 //This is the first word. Clear all values and call beginWord;
390 X2 = x;
391 Y2 = y;
392 horDist = 0;
393 verDist = 0;
394 height = 0;
395 beginTextBlock(state,x,y);
396 }
397 }
398
endString(GfxState * state)399 void ABWOutputDev::endString(GfxState *state) {
400
401 }
402
beginWord(GfxState * state,double x,double y)403 void ABWOutputDev::beginWord(GfxState *state, double x, double y){
404 char buf[20];
405 // printf("***BREAK!***\n");
406 endWord();
407 X1 = x;
408 Y2 = y;
409
410 horDist = X1-X2;
411 verDist = Y1-Y2;
412
413 X2 = X1;
414 height = state->getFont()->getAscent() * state->getTransformedFontSize();
415 Y1 = Y2-height;
416
417 N_word = xmlNewChild(N_Block, NULL, BAD_CAST "word", NULL);
418 sprintf(buf, "%f", X1); xmlNewProp(N_word, BAD_CAST "X1", BAD_CAST buf);
419 sprintf(buf, "%f", Y1); xmlNewProp(N_word, BAD_CAST "Y1", BAD_CAST buf);
420 sprintf(buf, "%d", Style); xmlNewProp(N_word, BAD_CAST "style", BAD_CAST buf);
421 }
422
endWord()423 void ABWOutputDev::endWord(){
424 char buf[20];
425 if (N_word) {
426 sprintf(buf, "%f", X2); xmlNewProp(N_word, BAD_CAST "X2", BAD_CAST buf);
427 sprintf(buf, "%f", Y2); xmlNewProp(N_word, BAD_CAST "Y2", BAD_CAST buf);
428 sprintf(buf, "%f", X2-X1); xmlNewProp(N_word, BAD_CAST "width", BAD_CAST buf);
429 sprintf(buf, "%f", Y2-Y1); xmlNewProp(N_word, BAD_CAST "height", BAD_CAST buf);
430 N_word = NULL;
431 }
432 }
433
beginTextBlock(GfxState * state,double x,double y)434 void ABWOutputDev::beginTextBlock(GfxState *state, double x, double y){
435 endTextBlock();
436 N_Block = xmlNewChild(N_page, NULL, BAD_CAST "Textblock", NULL);
437 beginWord(state,x,y);
438 }
439
endTextBlock()440 void ABWOutputDev::endTextBlock(){
441 if (N_Block) {
442 endWord();
443 N_Block = NULL;
444 }
445 }
446 /*
447 This will be a function to retrieve coherent text blocks from the chunk tree.*/
interpretXYTree()448 void ABWOutputDev::interpretXYTree(){
449 xmlNodePtr N_oldPage;
450 N_oldPage = N_page;
451 N_page = xmlNewNode(NULL, BAD_CAST "page");
452 N_column = N_page;
453 //xmlAddChild(N_content, N_page);
454 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
455 ATP_recursive(N_oldPage);
456 }
457
ATP_recursive(xmlNodePtr N_parent)458 void ABWOutputDev::ATP_recursive(xmlNodePtr N_parent){
459 xmlNodePtr N_first, N_second, N_line, N_tempCol, N_tempColset = NULL;
460
461 N_first = N_parent->children;
462 if (!N_first)
463 return;
464
465 N_second = N_first->next;
466 /*
467 Possibilities:
468 there is one child node
469 Because we cleaned up before the only case where we allow one childnode is
470 within Textblocks and textBlocks within 'vertical' nodes.
471 basically one text node means: add it to the current block.
472 There are two childnodes
473 This can be two verticals, two horizontals or one horizontal and a text node.
474 verticals:
475 If the first is vertical, the second is as well.
476 verticals mean: create a new Block, add a column per vertical make the
477 vertical the block and recurse inside.
478 then make the second vertical the block and recurse inside
479 then finish the block (ie. create a new one)
480 horizontal and or Textblocks
481 if first is textnode
482 add first to block
483 if second is textnode
484 at to block
485 else
486 call again
487 else
488 begin new block
489 call again
490 begin new block
491 if second is text node
492 add to block
493 else
494 call again
495 there are more then two child nodes
496 this can be a number of Textblocks and horizontals
497 add the textNodes to the current Block
498 if a horizontal is encountered enter it and generate a new block afterwards
499 */
500 //fprintf(stderr,"**********************************************************************\n");
501 //xmlSaveFormatFileEnc("-", doc, "UTF-8", 1);
502 switch (xmlLsCountNode(N_parent)) {
503 case 1:
504 //fprintf(stderr,"case 1\n");
505 N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
506 xmlUnlinkNode(N_first);
507 xmlAddChild(N_line, N_first);
508 break;
509 case 2:
510 //fprintf(stderr,"case 2\n");
511 if (xmlStrcasecmp(N_first->name,BAD_CAST "vertical") == 0){
512 //store the column for the moment
513 N_tempCol = N_column;
514 /*If we have three columns they will turn up in the tree as:
515 <vertical>
516 <vertical/>
517 <vertical/>
518 </vertical>
519 <vertical/>
520 */
521 //if the parent is a vertical as well, we can skip the colset generation
522 //thing here we can also remove the just added column and block, because
523 //these are going to replace them
524 if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
525 //fprintf(stderr,"first time column\n");
526 N_tempColset = N_colset;
527 N_colset = xmlNewChild(N_column, NULL, BAD_CAST "colset", NULL);
528 N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
529 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
530 }
531 else {
532 //fprintf(stderr,"second time column\n");
533 xmlUnlinkNode(N_column);
534 N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
535 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
536 }
537 //fprintf(stderr,"Building first column...\n");
538 ATP_recursive(N_first);
539 N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
540 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
541 //fprintf(stderr,"Building second column...\n");
542 ATP_recursive(N_second);
543 //make sure we end the column by continuing in the master column and
544 //setting the block and line to it
545 N_column = N_tempCol;
546 if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
547 if (N_tempColset != NULL)
548 N_colset = N_tempColset;
549 else
550 fprintf(stderr,"N_templColset should not! be empty (line 823)");//FIXME: add assert
551 }
552 }
553 else {
554 if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0) {
555 //fprintf(stderr,"add first as textblock\n");
556 N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
557 xmlUnlinkNode(N_first);
558 xmlAddChild(N_line, N_first);
559 if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
560 //fprintf(stderr,"add second as textblock\n");
561 //FIXME: this is not neat. We should ignore the cut ignoring when there are only two elements above
562 //line aggregation doesn't work anyway atm.
563 xmlUnlinkNode(N_second);
564 xmlAddChild(N_line, N_second);
565 //We have two textChunks that are going to be added to the line.
566 //the following statements make the line wrap around both textblocks
567 //if the firstX1 is smaller then the second X1 use the first, else use the second etc.
568 }
569 else {
570 //fprintf(stderr,"recursing into second\n");
571 ATP_recursive(N_second);
572 }
573 }
574 else {
575 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
576 //fprintf(stderr,"recursing into first\n");
577 ATP_recursive(N_first);
578 N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
579 if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
580 //fprintf(stderr,"add second as textblock\n");
581 N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
582 xmlUnlinkNode(N_second);
583 xmlAddChild(N_line, N_second);
584 }
585 else {
586 //fprintf(stderr,"recursing into second\n");
587 ATP_recursive(N_second);
588 }
589 }
590 }
591 break;
592 default:
593 //double tX1=0, tX2=0, tY1=0, tY2=0;
594 //fprintf(stderr,"case default\n");
595 N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
596 while (N_first){
597 //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) < tX1 ? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) : tX1 = tX1;
598 //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) > tX2 ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) : tX2 = tX2;
599 //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) < tY1 ? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) : tY1 = tY1;
600 //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) > tY2 ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) : tY1 = tY2;
601 N_second = N_first->next;
602 if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0){
603 xmlUnlinkNode(N_first);
604 xmlAddChild(N_line, N_first);
605 }
606 else { //fprintf(stderr,"This shouldn't happen! (line 700)\n");
607 }
608 N_first = N_second;
609 }
610 break;
611 }
612 }
613
614 /*The cleanup function. It started out as a simple function to remove empty nodes
615 so that I could call xmladdnewchildnode as often as I liked so that I wouldn't get seg-faults
616 It is now a bit more advanced, makes sure the tree is as it's supposed to be and adds information too*/
cleanUpNode(xmlNodePtr N_parent,bool aggregateInfo)617 void ABWOutputDev::cleanUpNode(xmlNodePtr N_parent, bool aggregateInfo){
618 double tX1=-1, tX2=-1, tY1=-1, tY2=-1;
619 xmlNodePtr N_cur, N_next;
620 N_cur = N_parent->children;
621 char buf[20];
622 int prevStyle = -1;
623 xmlChar *val;
624 int styleLength = xmlLsCountNode(N_styleset)+1;
625 float stylePos;
626 int *styles = new int[styleLength];
627 for (int i=1; i< styleLength; i++) { styles[i] = 0;}
628 /*
629 ignore two horizontal nodes with textBlocks right underneath them. They
630 signal the end of a chunk, and the horizontal seperation needs to be
631 preserved, because it means they are different lines. The second horizontal
632 therefore needs to be kept.
633 */
634 if ((xmlLsCountNode(N_parent) == 2)
635 &&
636 xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0
637 &&
638 N_cur
639 &&
640 N_cur->next
641 &&
642 xmlStrcasecmp(N_cur->name,BAD_CAST "horizontal") == 0 && xmlStrcasecmp(N_cur->next->name,BAD_CAST "horizontal") == 0
643 &&
644 xmlLsCountNode(N_cur) == 1 && xmlLsCountNode(N_cur->next) == 1
645 &&
646 xmlStrcasecmp(N_cur->children->name,BAD_CAST "Textblock") == 0 && xmlStrcasecmp(N_cur->next->children->name,BAD_CAST "Textblock") == 0
647 ) {
648 xmlAddPrevSibling(N_cur->next,N_cur->children);
649 xmlUnlinkNode(N_cur);
650 }
651 /*
652 This removes columns if one of the parts is actually a single letter.
653 I found out I liked the columns better, so I have the code commented out.
654 */
655 /* else if ((xmlLsCountNode(N_parent) == 2)
656 &&
657 N_cur
658 &&
659 N_cur->next
660 &&
661 xmlStrcasecmp(N_cur->name,BAD_CAST "vertical") == 0
662 &&
663 xmlStrcasecmp(N_cur->next->name,BAD_CAST "vertical") == 0
664 &&
665 (N_cur->children)
666 &&
667 (N_cur->children->children)
668 &&
669 (N_cur->children->children->children)
670 &&
671 xmlStrlen(N_cur->children->children->children->content) == 1) {
672 N_next = N_cur->next;
673 xmlAddChild(N_parent, N_next->children);
674 xmlAddPrevSibling(N_next->children->children, N_cur->children);
675 xmlUnlinkNode(N_cur);
676 xmlUnlinkNode(N_next);
677 } */else {
678 while (N_cur){
679 N_next = N_cur->next;
680 cleanUpNode(N_cur, aggregateInfo);
681 if (xmlLsCountNode(N_cur) == 0 && (xmlStrcasecmp(N_cur->name,BAD_CAST "cbr") != 0) && (xmlStrcasecmp(N_cur->name,BAD_CAST "s") != 0))
682 xmlUnlinkNode(N_cur);
683 //If the node is still around
684 N_cur = N_next;
685 }
686 }
687 //If a countainer element has only one child, it can be removed except for vertical
688 //cuts with only one textElement;
689 //the main reason for this code is to remove the crumbs after cleaning up in the loop above
690 if ((xmlLsCountNode(N_parent) == 1) && ((xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0) || ((xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") == 0) && (xmlStrcasecmp(N_parent->children->name,BAD_CAST "Textblock") != 0)))){
691 N_cur = N_parent->children;
692 xmlAddPrevSibling(N_parent,N_cur);
693 xmlUnlinkNode(N_parent);
694 }
695 //We cannot remove the page element so if it has only one childnode, we remove that childnode instead
696 if ((xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0) && (xmlLsCountNode(N_parent) == 1)) {
697 N_cur = N_parent->children->children;
698 while (N_cur){
699 N_next = N_cur->next;
700 xmlUnlinkNode(N_cur);
701 xmlAddChild(N_parent, N_cur);
702 N_cur = N_next;
703 }
704 xmlUnlinkNode(N_parent->children);
705 }
706 //Ok, so by this time the N_parent and his children are guaranteed to be clean
707 //this for loop gets information from the 'word' elements and propagates it up
708 //the tree.
709 if (aggregateInfo && xmlStrcasecmp(N_parent->name,BAD_CAST "word") != 0) {
710 for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
711 val = xmlGetProp(N_cur,BAD_CAST "style");
712 stylePos = xmlXPathCastStringToNumber(val);
713 //fprintf(stderr,"1: %f, %d\n",stylePos,int(stylePos));
714 styles[int(stylePos)]=styles[int(stylePos)]+1;
715 //fprintf(stderr,"2: styles[%d] = %d\n",int(stylePos),styles[int(stylePos)]);
716 (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) < tX1 || tX1 == -1)? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) : tX1 = tX1;
717 (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) > tX2) ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) : tX2 = tX2;
718 (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) < tY1 || tY1 == -1)? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) : tY1 = tY1;
719 (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) > tY2) ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) : tY2 = tY2;
720 }
721 sprintf(buf, "%f", tX1); xmlSetProp(N_parent, BAD_CAST "X1", BAD_CAST buf);
722 sprintf(buf, "%f", tX2); xmlSetProp(N_parent, BAD_CAST "X2", BAD_CAST buf);
723 sprintf(buf, "%f", tY1); xmlSetProp(N_parent, BAD_CAST "Y1", BAD_CAST buf);
724 sprintf(buf, "%f", tY2); xmlSetProp(N_parent, BAD_CAST "Y2", BAD_CAST buf);
725 sprintf(buf, "%f", tX2-tX1); xmlSetProp(N_parent, BAD_CAST "width", BAD_CAST buf);
726 sprintf(buf, "%f", tY2-tY1); xmlSetProp(N_parent, BAD_CAST "height", BAD_CAST buf);
727 prevStyle = 0;
728 styles[0] = -1;
729 for (int i=1; i< styleLength; i++) { if (styles[i] > styles[prevStyle]) prevStyle = i; }
730 //fprintf(stderr,"%d\n", prevStyle);
731 if (prevStyle > 0){
732 sprintf(buf, "%d", prevStyle); xmlSetProp(N_parent, BAD_CAST "style", BAD_CAST buf);
733 }
734 }
735 if (N_parent->children && xmlStrcasecmp(N_parent->children->name,BAD_CAST "line") == 0 && xmlGetProp(N_parent->children,BAD_CAST "alignment") != NULL)
736 xmlSetProp(N_parent, BAD_CAST "alignment", xmlGetProp(N_parent->children,BAD_CAST "alignment"));
737
738 delete[] styles;
739 }
740
generateParagraphs()741 void ABWOutputDev::generateParagraphs() {
742 xmlNodePtr N_cur, N_parent, N_p, N_line, N_next;
743 int lvl;
744 //basically I first detect the text-alignment within blocks.
745 //ASSUMPTION: my block seperation thing is good enough so I don't need to
746 //worry about two alignments in one paragraph
747
748 X1 = 0;
749 X2 = pdfdoc->getPageCropWidth(G_pageNum);
750 Y1 = 0;
751 Y2 = pdfdoc->getPageCropHeight(G_pageNum);
752 addAlignment(N_page);
753
754 //then it's a switch per alignement
755 N_cur = N_page->children;
756 N_parent = N_page;
757 lvl = 1;
758 while (N_cur) {
759 if (xmlStrcasecmp(N_cur->name,BAD_CAST "chunk") == 0){
760 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
761 xmlAddPrevSibling(N_cur,N_p);
762 //N_p = xmlNewChild(N_parent, NULL, BAD_CAST "chunk", NULL);
763 //A new paragraph is created when:
764 switch (int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "alignment")))){
765 //left
766 case 1: //the distance between the texblock X2 and the last word X2 is more than
767 //the following first word width.
768 N_line = N_cur->children;
769 while (N_line){
770 N_next = N_line->next;
771 xmlUnlinkNode(N_line);
772 xmlAddChild(N_p,N_line);
773 xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
774 if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
775 if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
776 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
777 xmlAddPrevSibling(N_cur,N_p);
778 }
779 }
780 N_line = N_next;
781 }
782 break;
783 //right
784 case 2: //the same but now with X1 and first word and following last word
785 N_line = N_cur->children;
786 while (N_line){
787 N_next = N_line->next;
788 xmlUnlinkNode(N_line);
789 xmlAddChild(N_p,N_line);
790 xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
791 if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
792 //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
793 if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
794 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
795 xmlAddPrevSibling(N_cur,N_p);
796 }
797 }
798 N_line = N_next;
799 }
800 break;
801 //centered
802 case 3: //the combined left and right space is more than the following first word
803 N_line = N_cur->children;
804 while (N_line){
805 N_next = N_line->next;
806 xmlUnlinkNode(N_line);
807 xmlAddChild(N_p,N_line);
808 xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
809 if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
810 //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
811 if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
812 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
813 xmlAddPrevSibling(N_cur,N_p);
814 }
815 }
816 N_line = N_next;
817 }
818 break;
819 //justified
820 case 4:
821 //we break on all alignment=1 lines. A line with alignment=1 that is the first of a block will
822 //also initiate a paragraph break before.
823 N_line = N_cur->children;
824 if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
825 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
826 xmlAddPrevSibling(N_cur,N_p);
827 }
828 while (N_line){
829 N_next = N_line->next;
830 xmlUnlinkNode(N_line);
831 xmlAddChild(N_p,N_line);
832 if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
833 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
834 xmlAddPrevSibling(N_cur,N_p);
835 }
836 xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
837 N_line = N_next;
838 }
839 break;
840 }
841 }
842 else if (xmlStrcasecmp(N_cur->name,BAD_CAST "colset") == 0 || xmlStrcasecmp(N_cur->name,BAD_CAST "column") == 0){
843 N_parent = N_cur;
844 N_cur = N_cur->children;
845 lvl++;
846 N_p = xmlNewNode(NULL, BAD_CAST "chunk");
847 xmlAddPrevSibling(N_cur,N_p);
848 continue;
849 }
850 if (N_cur->next)
851 N_cur = N_cur->next;
852 else while (lvl > 0){
853 N_cur = N_parent;
854 N_parent = N_cur->parent;
855 lvl--;
856 if (N_cur->next){
857 N_cur = N_cur->next;
858 break;
859 }
860 }
861 if (lvl==0)
862 N_cur = NULL;
863 }
864 }
865
866 //function that adds an 'alignment=' property to the <chunk>s
addAlignment(xmlNodePtr N_parent)867 void ABWOutputDev::addAlignment(xmlNodePtr N_parent) {
868 xmlNodePtr N_chunk, N_line;
869 double tX1, tX2;
870 bool leftMatch, rightMatch, centerMatch;
871 int leftCnt = 0, rightCnt = 0, cntrCnt = 0, justCnt = 0;
872 //fprintf(stderr,"Entering addAlignment\n");
873 for (N_chunk = N_parent->children; N_chunk; N_chunk = N_chunk->next) {
874 if (xmlStrcasecmp(N_chunk->name,BAD_CAST "chunk") == 0){
875 X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
876 X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
877 //fprintf(stderr,"Found chunk\n");
878 //if the chunk contains only one line, we don't need to loop through it.
879 if (xmlLsCountNode(N_chunk) == 1){
880 //fprintf(stderr,"Processing line\n");
881 //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")));
882 //fprintf(stderr,"%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
883 //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
884 // a one line chunk, is either centered or left or right-aligned.
885 if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))) > 1) {
886 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
887 xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "2");
888 //fprintf(stderr,"alignment = right\n");
889 }
890 else {
891 if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")))< -1) {
892 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
893 xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "1");
894 //fprintf(stderr,"alignment = left\n");
895 }
896 else {
897 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
898 xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "3");
899 //fprintf(stderr,"alignment = center\n");
900 }
901 }
902 }
903 else {
904 leftCnt = 0;
905 rightCnt = 0;
906 cntrCnt = 0;
907 justCnt = 0;
908 for (N_line = N_chunk->children; N_line; N_line = N_line->next) {
909 //fprintf(stderr,"Processing line\n");
910 /*
911 |X1 - cX1| == 1
912 |X2 - cX2| == 1
913 |(cX1-X1)-(X2-cX2)| == 1
914 ok, each line can be just as wide as the current set,
915 it can be smaller and moved to the right
916 it can be smaller and moved to the left.
917 it can
918 */
919 //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")));
920 //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))));
921 leftMatch = fabs(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1) < 2;
922 rightMatch = fabs(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))) < 2;
923 centerMatch = fabs((xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")))) < 2;
924 if (leftMatch && rightMatch) {
925 xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
926 justCnt++;
927 }
928 else if (centerMatch) {
929 xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
930 cntrCnt++;
931 }
932 else if (rightMatch) {
933 xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
934 rightCnt++;
935 }
936 else {
937 xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
938 leftCnt++;
939 }
940 }
941 //there is almost always one justified line in a centered text
942 //and most justified blocks have at least one left aligned line
943 //fprintf(stderr,"1:%d ,2:%d ,3:%d ,4:%d\n",leftCnt,justCnt,cntrCnt,rightCnt);
944 if ((leftCnt-1 >= justCnt) && (leftCnt >= rightCnt) && (leftCnt >= cntrCnt))
945 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
946 else if ((justCnt >= leftCnt-1) && (justCnt >= rightCnt) && (justCnt >= cntrCnt))
947 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "4");
948 else if ((cntrCnt >= justCnt-1) && (cntrCnt >= rightCnt) && (cntrCnt >= leftCnt))
949 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
950 else
951 xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
952 }
953 }
954 else {
955 if (xmlStrcasecmp(N_chunk->name,BAD_CAST "colset") == 0){
956 //fprintf(stderr,"Found a colset\n");
957 addAlignment(N_chunk);
958 }
959 else {
960 if (xmlStrcasecmp(N_chunk->name,BAD_CAST "column") == 0){
961 //fprintf(stderr,"Found a column\n");
962 tX1 = X1;
963 tX2 = X2;
964 X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
965 X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
966 addAlignment(N_chunk);
967 X1 = tX1;
968 X2 = tX2;
969 }
970 else { //fprintf(stderr,"Found something else\n");
971 }
972 }
973 }
974 }
975 //parse all blocks, and all lines within all blocks
976 //do a set of checks and tick a flag if the check fails
977 //check for line X1 is textBlock X1
978 //check for line X2 is textblock X2
979 //check if line is centered in textBock (LX1 != TX1 && LX2 != TX2 && LX1-TX1 == TX2=LX2)
980 //if the LX1 != TX1 then how much is the difference?
981 //a line isn't left aligned if all lines have a different X1 <= not so strong assumption.
982
983 //justified if both are straight except for a couple of (same factor sized) indents at the left
984 //else centered if above calculation is correct
985 //else left aligned if left side is more straight than right (more lines in the same X1 or common factor
986 //else right
987 }
988
setPDFDoc(PDFDoc * priv_pdfdoc)989 void ABWOutputDev::setPDFDoc(PDFDoc *priv_pdfdoc) {
990 pdfdoc = priv_pdfdoc;
991 }
992
createABW()993 void ABWOutputDev::createABW() {
994 //*************************************************************
995 //change styles to abiword format
996 xmlNodePtr N_cur, N_next;
997 xmlAttrPtr N_prop;
998 char buf[500];
999 for (N_cur = N_styleset->children; N_cur; N_cur = N_cur->next){
1000 sprintf(buf,"margin-top:0pt; color:000000; margin-left:0pt; text-position:normal; widows:2; text-indent:0in; font-variant:normal; margin-right:0pt; lang:nl-NL; line-height:1.0; font-size:%dpt; text-decoration:none; margin-bottom:0pt; bgcolor:transparent; text-align:left; font-stretch:normal;",int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size"))));
1001 strncat(buf,"font-family:",12);
1002 strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "font"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "font")));
1003 strncat(buf,";",1);
1004 strncat(buf,"font-weight:",12);
1005 strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "bold"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "bold")));
1006 strncat(buf,"font-style:",12);
1007 strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "italic"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "italic")));
1008 xmlSetProp(N_cur, BAD_CAST "props", BAD_CAST buf);
1009 N_prop = xmlHasProp(N_cur, BAD_CAST "id");
1010 if (N_prop != NULL) xmlRemoveProp(N_prop);
1011 N_prop = xmlHasProp(N_cur, BAD_CAST "size");
1012 if (N_prop != NULL) xmlRemoveProp(N_prop);
1013 N_prop = xmlHasProp(N_cur, BAD_CAST "bold");
1014 if (N_prop != NULL) xmlRemoveProp(N_prop);
1015 N_prop = xmlHasProp(N_cur, BAD_CAST "italic");
1016 if (N_prop != NULL) xmlRemoveProp(N_prop);
1017 N_prop = xmlHasProp(N_cur, BAD_CAST "font");
1018 if (N_prop != NULL) xmlRemoveProp(N_prop);
1019 }
1020 //*************************************************************
1021 //Change the rest of the document
1022 //each child of N_content is a page
1023 N_cur = N_content->children;
1024 while (N_cur){
1025 //we creat a section node and attach it to the root, it will com after all
1026 //the page nodes. Then we transform the page, and finally remove it
1027 N_next = N_cur->next;
1028 //fprintf(stderr,"***Transforming page\n");
1029 N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1030 transformPage(N_cur);
1031 xmlUnlinkNode(N_cur);
1032 //fprintf(stderr,"***Finished transforming page\n");
1033 N_cur = N_next;
1034 }
1035 cleanUpNode(N_root, false);
1036 }
1037
transformPage(xmlNodePtr N_parent)1038 void ABWOutputDev::transformPage(xmlNodePtr N_parent){
1039 char buf[60];
1040 xmlNodePtr N_cur, N_curLine, N_curText, N_curWord, text, space;
1041 //translate the nodes into abiword nodes
1042 if (xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0){
1043 for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1044 //fprintf(stderr,"**pass a page child\n");
1045 transformPage(N_cur);
1046 }
1047 }
1048 if (xmlStrcasecmp(N_parent->name,BAD_CAST "chunk") == 0){
1049 //fprintf(stderr,"Found a chunk\n");
1050 //I start a <p> on each chunk and add all word containment
1051 N_text = xmlNewChild(N_Block, NULL, BAD_CAST "p", NULL);
1052 if (int(xmlXPathCastStringToNumber(xmlGetProp(N_parent,BAD_CAST "style"))) > 0){
1053 xmlNewProp(N_text, BAD_CAST "style", xmlGetProp(N_parent,BAD_CAST "style"));
1054 }
1055 switch (int(xmlXPathCastStringToNumber(xmlGetProp(N_parent,BAD_CAST "alignment")))){
1056 case 1: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:left");
1057 break;
1058 case 2: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:right");
1059 break;
1060 case 3: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:center");
1061 break;
1062 case 4: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:justify");
1063 break;
1064 }
1065 for (N_curLine = N_parent->children; N_curLine; N_curLine = N_curLine->next){
1066 //fprintf(stderr,"A line\n");
1067 for (N_curText = N_curLine->children; N_curText; N_curText = N_curText->next){
1068 //fprintf(stderr,"a textNode\n");
1069 for (N_curWord = N_curText->children; N_curWord; N_curWord = N_curWord->next){
1070 //fprintf(stderr,"a word\n");
1071 text = N_curWord->children;
1072 xmlUnlinkNode(text);
1073 xmlAddChild(N_text,text);
1074 space = xmlNewText(BAD_CAST " ");
1075 xmlAddChild(N_text,space);
1076 }
1077 }
1078 }
1079 }
1080 if (xmlStrcasecmp(N_parent->name,BAD_CAST "column") == 0){
1081 //fprintf(stderr,"Found a column\n");
1082 for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1083 transformPage(N_cur);
1084 }
1085 xmlNewChild(N_text, NULL, BAD_CAST "cbr", NULL);
1086 }
1087 if (xmlStrcasecmp(N_parent->name,BAD_CAST "colset") == 0){
1088 //fprintf(stderr,"Found a colset\n");
1089 //create new section columns: count childNodes of N_cur
1090 //recurse through chunks and create textNodes
1091 N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1092 sprintf(buf,"columns:%d",xmlLsCountNode(N_parent));
1093 xmlNewProp(N_Block, BAD_CAST "props", BAD_CAST buf);
1094 for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1095 transformPage(N_cur);
1096 }
1097 N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1098 }
1099 //fprintf(stderr,"at the end\n");
1100 }
1101
1102 //Count nodes, copied from debugxml.c from libxml
1103 // libxml copyright file below
1104 /*
1105 Except where otherwise noted in the source code (e.g. the files hash.c,
1106 list.c and the trio files, which are covered by a similar licence but
1107 with different Copyright notices) all the files are:
1108
1109 Copyright (C) 1998-2003 Daniel Veillard. All Rights Reserved.
1110
1111 Permission is hereby granted, free of charge, to any person obtaining a copy
1112 of this software and associated documentation files (the "Software"), to deal
1113 in the Software without restriction, including without limitation the rights
1114 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1115 copies of the Software, and to permit persons to whom the Software is fur-
1116 nished to do so, subject to the following conditions:
1117
1118 The above copyright notice and this permission notice shall be included in
1119 all copies or substantial portions of the Software.
1120
1121 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1122 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
1123 NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1124 DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
1125 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CON-
1126 NECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1127
1128 Except as contained in this notice, the name of Daniel Veillard shall not
1129 be used in advertising or otherwise to promote the sale, use or other deal-
1130 ings in this Software without prior written authorization from him.
1131 */
xmlLsCountNode(xmlNodePtr node)1132 int ABWOutputDev::xmlLsCountNode(xmlNodePtr node) {
1133 int ret = 0;
1134 xmlNodePtr list = NULL;
1135
1136 if (node == NULL)
1137 return(0);
1138
1139 switch (node->type) {
1140 case XML_ELEMENT_NODE:
1141 list = node->children;
1142 break;
1143 case XML_DOCUMENT_NODE:
1144 case XML_HTML_DOCUMENT_NODE:
1145 #ifdef LIBXML_DOCB_ENABLED
1146 case XML_DOCB_DOCUMENT_NODE:
1147 #endif
1148 list = ((xmlDocPtr) node)->children;
1149 break;
1150 case XML_ATTRIBUTE_NODE:
1151 list = ((xmlAttrPtr) node)->children;
1152 break;
1153 case XML_TEXT_NODE:
1154 case XML_CDATA_SECTION_NODE:
1155 case XML_PI_NODE:
1156 case XML_COMMENT_NODE:
1157 if (node->content != NULL) {
1158 ret = xmlStrlen(node->content);
1159 }
1160 break;
1161 case XML_ENTITY_REF_NODE:
1162 case XML_DOCUMENT_TYPE_NODE:
1163 case XML_ENTITY_NODE:
1164 case XML_DOCUMENT_FRAG_NODE:
1165 case XML_NOTATION_NODE:
1166 case XML_DTD_NODE:
1167 case XML_ELEMENT_DECL:
1168 case XML_ATTRIBUTE_DECL:
1169 case XML_ENTITY_DECL:
1170 case XML_NAMESPACE_DECL:
1171 case XML_XINCLUDE_START:
1172 case XML_XINCLUDE_END:
1173 ret = 1;
1174 break;
1175 }
1176 for (;list != NULL;ret++)
1177 list = list->next;
1178 return(ret);
1179 }
1180