1 //========================================================================
2 //
3 // ABWOutputDev.cc
4 //
5 // Copyright 2006-2007 Jauco Noordzij <jauco@jauco.nl>
6 // Copyright 2007 Dominic Lachowicz <cinamod@hotmail.com>
7 // Copyright 2008 Hib Eris <hib@hiberis.nl>
8 //
9 // Based somewhat on HtmlOutputDev.cc
10 //
11 //========================================================================
12 
13 #ifdef __GNUC__
14 #pragma implementation
15 #endif
16 
17 #include "config.h"
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <stdarg.h>
21 #include <stddef.h>
22 #include <ctype.h>
23 #include <math.h>
24 #include "goo/GooString.h"
25 #include "goo/GooList.h"
26 #include "UnicodeMap.h"
27 #include "goo/gmem.h"
28 #include "Error.h"
29 #include "GfxState.h"
30 #include "GlobalParams.h"
31 #include "ABWOutputDev.h"
32 #include "PDFDoc.h"
33 
34 #include <libxml/parser.h>
35 #include <libxml/tree.h>
36 #include <libxml/xpath.h>
37 #include <libxml/xpathInternals.h>
38 
39 
// Word-break threshold used by beginString: a horizontal gap wider than
// (transformed font size * minWordBreakSpace) starts a new word.
#define minWordBreakSpace 0.1

// Block-break threshold used by beginString: a horizontal gap wider than
// (transformed font size * maxWordSpacing) starts a new text block.
#define maxWordSpacing 1.5

// Line-break threshold used by beginString: a vertical jump larger than
// (transformed font size / maxLineSpacingDelta) starts a new text block.
// Note that this value is used as a divisor, not as a factor.
#define maxLineSpacingDelta 1.5

// NOTE(review): neither cut value is referenced in the part of the file
// reviewed here -- presumably limits for the XY-cut; confirm before relying
// on them.
#define C_maxVCutValue 4
#define C_maxHCutValue 5
53 //------------------------------------------------------------------------
54 // ABWOutputDev
55 //------------------------------------------------------------------------
56 
ABWOutputDev(xmlDocPtr ext_doc)57 ABWOutputDev::ABWOutputDev(xmlDocPtr ext_doc)
58 {
59   pdfdoc = NULL;
60   N_page = N_style = N_text = N_styleset = N_Block = N_word = NULL;
61   doc = ext_doc;
62   N_root = xmlNewNode(NULL, BAD_CAST "abiword");
63   xmlDocSetRootElement(doc, N_root);
64   N_styleset = xmlNewChild(N_root, NULL, BAD_CAST "styles", NULL);
65   N_content = xmlNewChild(N_root, NULL, BAD_CAST "content", NULL);
66   uMap = globalParams->getTextEncoding();
67   maxStyle = Style = 1;
68 }
69 
~ABWOutputDev()70 ABWOutputDev::~ABWOutputDev() {
71   xmlCleanupParser();
72 }
73 
startPage(int pageNum,GfxState * state)74 void ABWOutputDev::startPage(int pageNum, GfxState *state) {
75   /*While reading a pdf page this node acts as a placeholder parent.
76   when conversion is finished and the page is structured as we like it
77   all text fragments are moved from N_page to N_content.*/
78   N_page = xmlNewNode(NULL, BAD_CAST "page");
79   G_pageNum = pageNum;
80 }
81 
82 /*Callback to denote that poppler reached the end of a page
83 here I insert most of the interesting processing stuff*/
endPage()84 void ABWOutputDev::endPage() {
85   //make sure all words are closed
86   endTextBlock();
87   cleanUpNode(N_page, true);
88   //xmlAddChild(N_content, N_page);
89   //xmlSaveFormatFileEnc("pre-cut.xml", doc, "UTF-8", 1);
90   //xmlUnlinkNode(N_page);
91   //call the top down cutting mechanism
92   recursiveXYC(N_page);
93   //by stopping to worry about creating empty nodes I made the code quite a
94   //bit more robust. This function makes sure we have a nice'n'clean tree
95   cleanUpNode(N_page, true);
96   //xmlAddChild(N_content, N_page);
97   //xmlSaveFormatFileEnc("raw.xml", doc, "UTF-8", 1);
98   //xmlUnlinkNode(N_page);
99 
100   //Interpret the XY tree and infer text blocks and columns
101   interpretXYTree();
102   cleanUpNode(N_page, true);
103   //xmlAddChild(N_content, N_page);
104   //xmlSaveFormatFileEnc("interpreted.xml", doc, "UTF-8", 1);
105   //xmlUnlinkNode(N_page);
106 
107   //I have blocks and columns, this function will turn that into paragraphs and
108   //columns
109   generateParagraphs();
110   cleanUpNode(N_page, true);
111   xmlAddChild(N_content, N_page);
112   N_page = NULL;
113 }
114 
recursiveXYC(xmlNodePtr nodeset)115 void ABWOutputDev::recursiveXYC(xmlNodePtr nodeset) {
116   /*This function implements the recursive XY Cut. basically, it gets
117   the largest piece of whitespace (using getBiggestSeperator()) and then
118   splits the page using splitNodes on that whitespace. It calls itself again
119   with both the halves*/
120   float bhs, bvs, X1, X2, Y1, Y2;
121 
122   bvs = getBiggestSeperator(nodeset, VERTICAL, &X1, &X2);
123   bhs = getBiggestSeperator(nodeset, HORIZONTAL, &Y1, &Y2);
124 
125   if (bvs == -1){
126     if (bhs == -1){//both -1
127       //FIXME: add assertions that bvs and bhs are >=-1
128       printf("No seperators\n");
129       return;
130     }
131     else { //only bhs > -1
132       splitNodes(Y1, HORIZONTAL, nodeset, bhs);
133     }
134   }
135   else {
136     if (bhs == -1){//only bvs > -1
137       splitNodes(X1, VERTICAL, nodeset, bvs);
138     }
139     else {//both > -1
140       if (bvs >= (bhs/1.7)){
141         //When people read a text they prefer vertical cuts over horizontal
142         //ones. I'm not that sure about the 1.7 value, but it seems to work.
143         splitNodes(X1, VERTICAL, nodeset, bvs);
144       }
145       else {
146         splitNodes(Y1, HORIZONTAL, nodeset, bhs);
147       }
148     }
149   }
150   recursiveXYC(nodeset->children);
151   recursiveXYC(nodeset->children->next);
152 }
153 
splitNodes(float splitValue,unsigned int direction,xmlNodePtr N_parent,double seperator)154 void ABWOutputDev::splitNodes(float splitValue, unsigned int direction, xmlNodePtr N_parent, double seperator){
155   //This function takes a nodeset and splits it based on a cut value. It returns
156   //the nodePtr with two childnodes, the both chunks.
157   xmlNodePtr N_move, N_cur, N_newH, N_newL;
158   char * propName;
159   const char *nodeName;
160   char buf[20];
161   if (direction == HORIZONTAL) {
162     propName = "Y1";
163     nodeName = "horizontal";
164   }
165   else {
166     propName = "X1";
167     nodeName = "vertical";
168   }
169   N_newH = xmlNewNode(NULL, BAD_CAST nodeName);
170   N_newL = xmlNewNode(NULL, BAD_CAST nodeName);
171   sprintf(buf, "%f", seperator);
172   xmlNewProp(N_newH, BAD_CAST "diff", BAD_CAST buf);
173   sprintf(buf, "%f", seperator);
174   xmlNewProp(N_newL, BAD_CAST "diff", BAD_CAST buf);
175   N_cur = N_parent->children;
176   while (N_cur){
177     N_move = N_cur->next;
178     xmlUnlinkNode(N_cur);
179     if (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST propName)) > splitValue){
180       xmlAddChild(N_newH, N_cur);
181     }
182     else {
183       xmlAddChild(N_newL, N_cur);
184     }
185     N_cur = N_move;
186   }
187   xmlAddChild(N_parent, N_newL);
188   xmlAddChild(N_parent, N_newH);
189 }
190 
getBiggestSeperator(xmlNodePtr N_set,unsigned int direction,float * C1,float * C2)191 float ABWOutputDev::getBiggestSeperator(xmlNodePtr N_set, unsigned int direction, float * C1, float * C2)
192 {
193   int i = 0;
194   int nodeCount = xmlLsCountNode(N_set);
195   float store;
196   int min;
197   float gap, endV;
198   float * stt;
199   float * end;
200   if (nodeCount == 0){
201     //Add assertion that this shouldn't happen
202     fprintf(stderr,"No child nodes");
203     return -1;
204   }
205   stt = new float[nodeCount];
206   end = new float[nodeCount];
207   //store all variables in two arrays (one for start, one for end coordinates)
208   if (direction == VERTICAL) {
209     for (xmlNodePtr N_cur = N_set->children; N_cur != NULL; N_cur = N_cur->next){
210       stt[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1"));
211       end[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2"));
212       i++;
213     }
214   }
215   else {
216     for (xmlNodePtr N_cur = N_set->children; N_cur != NULL; N_cur = N_cur->next){
217       stt[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1"));
218       end[i] = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2"));
219       i++;
220     }
221   }
222   //Sort them
223   for (i = 0; i < nodeCount - 1; i++){
224     min = i;
225     for (int j = i + 1; j < nodeCount; j++)
226       if (stt[j] < stt[i])
227         min = j;
228     store = stt[i];
229     stt[i] = stt[min];
230     stt[min] = store;
231     store = end[i];
232     end[i] = end[min];
233     end[min] = store;
234   }
235   //find the largest gap
236   gap = -1;
237   endV = end[0];
238   *C1 = 0;
239   *C2 = 0;
240   for (int inspect = 1; inspect < nodeCount; inspect++){
241     //no gap
242     if (((stt[inspect] - endV) - gap) < 0.5){ //FIXME:This is copied almost directly from the previous function, needs checking out
243       //partial overlap instead of complete one
244       if (end[inspect] > endV)
245         endV = end[inspect];
246     }
247     //gap
248     else{
249       //gap is larger than any previous gap
250       if (gap < (stt[inspect] - endV)){
251         gap = stt[inspect] - endV;
252         *C1 = endV;
253         *C2 = stt[inspect];
254       }
255       endV = end[inspect];
256     }
257   }
258   delete[] stt;
259   delete[] end;
260   return gap;
261 }
262 
updateFont(GfxState * state)263 void ABWOutputDev::updateFont(GfxState *state) {
264   char buf[160];
265   xmlNodePtr N_cur;
266   GfxFont *font;
267   bool found = false;
268   bool isBold, isItalic, S_isBold, S_isItalic;
269   isBold = isItalic = S_isBold =  S_isItalic = false;
270   font = state->getFont();
271   GooString *ftName;
272   char *fnEnd, *fnName;
273   int fnStart, ftSize;
274   //the first time this function is called there is no funt.
275   //Fixme: find out if that isn'y a bug
276   if (font){
277     isBold = (font->isBold() || font->getWeight() >6 || (strstr(font->getOrigName()->getCString(), "Bold")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-4)));
278     isItalic =  (font->isItalic() || (strstr(font->getOrigName()->getCString(), "Italic")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-6)));
279     ftSize = int(state->getTransformedFontSize())-1;
280     ftName = new GooString(font->getOrigName());
281     fnStart = strcspn(ftName->getCString(), "+");
282     if (fnStart < ftName->getLength())
283       ftName->del(0,fnStart+1);
284     fnEnd = strrchr(ftName->getCString(), 44);
285     if (fnEnd == 0)
286       fnEnd = strrchr(ftName->getCString(), 45);
287     if (fnEnd != 0)
288       ftName->del(fnEnd-ftName->getCString(),ftName->getLength()-1);
289 
290 /*    fnName = ftName;
291     if (isBold or isItalic){
292       fnStart = strcspn(fnName, "+");
293       if (fnStart == font->getOrigName()->getLength())
294         fnStart = 0;
295       else fnStart++;
296 
297       fnEnd = strstr(fnName, ",");
298       if (fnEnd == 0)
299         fnEnd = strstr(fnName, "-");
300       if (fnEnd != 0)
301         fnName[fnEnd-fnName] = 0;
302 //      char fntName[fnLength];
303 //      strncpy (fntName,fnName+fnStart+1,fnLength);
304       fnName+=fnStart;
305 //      fnName = fntName;
306     }
307     else {*/
308       fnName = ftName->getCString();
309 //    }
310     for (N_cur = N_styleset->children; N_cur; N_cur = N_cur ->next){
311       if (
312        isBold == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "bold"),BAD_CAST "bold;") == 0)
313        &&
314        isItalic == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "italic"),BAD_CAST "italic") == 0)
315        &&
316        xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "font"),BAD_CAST fnName) == 0
317        &&
318        xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size")) == ftSize
319       ) {
320         found = true;
321         Style = int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "id")));
322       }
323     }
324     if (!found){
325       N_cur = xmlNewChild(N_styleset, NULL, BAD_CAST "s", NULL);
326       xmlSetProp(N_cur, BAD_CAST "type", BAD_CAST "P");
327       sprintf(buf, "%d", maxStyle++);
328       xmlSetProp(N_cur, BAD_CAST "name", BAD_CAST buf);
329       xmlSetProp(N_cur, BAD_CAST "id", BAD_CAST buf);
330       Style = maxStyle;
331       sprintf(buf, "%d", ftSize); xmlSetProp(N_cur, BAD_CAST "size", BAD_CAST buf);
332       isBold   ? xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "bold;")  : xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "normal;");
333       isItalic ? xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "italic"): xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "normal");
334       xmlSetProp(N_cur, BAD_CAST "font", BAD_CAST fnName);
335     }
336   }
337 }
338 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode code,int nBytes,Unicode * u,int uLen)339 void ABWOutputDev::drawChar(GfxState *state, double x, double y,
340 			double dx, double dy,
341 			double originX, double originY,
342 			CharCode code, int nBytes, Unicode *u, int uLen)
343 {
344   //I wouldn't know what size this should safely be. I guess 64 bytes should be
345   //enough for any unicode character
346   char buf[64];
347   int charLen;
348   x = dx;
349   y = dy;
350   //state->textTransformDelta(dx * state->getHorizScaling(), dy, &dx, &dy);
351   //state->transformDelta(dx, dy, &dx, &dy);
352   if (uLen == 1 && code == 0x20) {
353     //If we break a text sequence on space, then the X1 should be increased
354     //but the Y1 and Y2 should remain the same.
355     beginWord(state,X2+dx,Y2);
356   }
357   else {
358     X2    += dx;
359     Y2    += dy;
360     charLen = uMap->mapUnicode(*u,buf,sizeof(buf));
361     //Getting Unicode to libxml is something I need to fix.
362     //simply passing it using a bad-cast isn't working.
363     //I assume that CharCode code it the U+value of the unicode character
364     //But for a ligature code gives me DF which is the ringel-s, I guess
365     //code should be two bytes wide?
366     xmlNodeAddContentLen(N_word, BAD_CAST buf, charLen);
367   }
368 }
369 
beginString(GfxState * state,GooString * s)370 void ABWOutputDev::beginString(GfxState *state, GooString *s) {
371   double x,y;
372   //state->textTransform(x, y, &x, &y);
373   state->transform(state->getCurX(), state->getCurY(), &x, &y);
374   if (N_word) {
375     verDist = y-Y2;
376     horDist = x-X2;
377     //TEST:changed fabs(horDist) to horDist
378     //FIXME: this if statement seems awkward to me.
379     if (horDist > (state->getTransformedFontSize()*maxWordSpacing) || (fabs(verDist) > (state->getTransformedFontSize()/maxLineSpacingDelta))) {
380       beginTextBlock(state,x,y);
381     }
382     else {
383       if ((horDist > (state->getTransformedFontSize()*minWordBreakSpace)) || (fabs(verDist) > (state->getTransformedFontSize()/maxLineSpacingDelta))) {
384         beginWord(state,x,y);
385       }
386     }
387   }
388   else {
389   //This is the first word. Clear all values and call beginWord;
390     X2 = x;
391     Y2 = y;
392     horDist = 0;
393     verDist = 0;
394     height  = 0;
395     beginTextBlock(state,x,y);
396   }
397 }
398 
endString(GfxState * state)399 void ABWOutputDev::endString(GfxState *state) {
400 
401 }
402 
beginWord(GfxState * state,double x,double y)403 void ABWOutputDev::beginWord(GfxState *state, double x, double y){
404   char buf[20];
405 //  printf("***BREAK!***\n");
406   endWord();
407   X1 = x;
408   Y2 = y;
409 
410   horDist = X1-X2;
411   verDist = Y1-Y2;
412 
413   X2 = X1;
414   height = state->getFont()->getAscent() * state->getTransformedFontSize();
415   Y1 = Y2-height;
416 
417   N_word = xmlNewChild(N_Block, NULL, BAD_CAST "word", NULL);
418   sprintf(buf, "%f", X1); xmlNewProp(N_word, BAD_CAST "X1", BAD_CAST buf);
419   sprintf(buf, "%f", Y1); xmlNewProp(N_word, BAD_CAST "Y1", BAD_CAST buf);
420   sprintf(buf, "%d", Style); xmlNewProp(N_word, BAD_CAST "style", BAD_CAST buf);
421 }
422 
endWord()423 void ABWOutputDev::endWord(){
424   char buf[20];
425   if (N_word) {
426     sprintf(buf, "%f", X2);    xmlNewProp(N_word, BAD_CAST "X2", BAD_CAST buf);
427     sprintf(buf, "%f", Y2);    xmlNewProp(N_word, BAD_CAST "Y2", BAD_CAST buf);
428     sprintf(buf, "%f", X2-X1); xmlNewProp(N_word, BAD_CAST "width", BAD_CAST buf);
429     sprintf(buf, "%f", Y2-Y1); xmlNewProp(N_word, BAD_CAST "height", BAD_CAST buf);
430     N_word = NULL;
431   }
432 }
433 
beginTextBlock(GfxState * state,double x,double y)434 void ABWOutputDev::beginTextBlock(GfxState *state, double x, double y){
435   endTextBlock();
436   N_Block = xmlNewChild(N_page, NULL, BAD_CAST "Textblock", NULL);
437   beginWord(state,x,y);
438 }
439 
endTextBlock()440 void ABWOutputDev::endTextBlock(){
441   if (N_Block) {
442     endWord();
443     N_Block = NULL;
444   }
445 }
446 /*
447 This will be a function to retrieve coherent text blocks from the chunk tree.*/
interpretXYTree()448 void ABWOutputDev::interpretXYTree(){
449   xmlNodePtr N_oldPage;
450   N_oldPage = N_page;
451   N_page = xmlNewNode(NULL, BAD_CAST "page");
452   N_column = N_page;
453   //xmlAddChild(N_content, N_page);
454   N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
455   ATP_recursive(N_oldPage);
456 }
457 
ATP_recursive(xmlNodePtr N_parent)458 void ABWOutputDev::ATP_recursive(xmlNodePtr N_parent){
459   xmlNodePtr N_first, N_second, N_line, N_tempCol, N_tempColset = NULL;
460 
461   N_first  = N_parent->children;
462   if (!N_first)
463     return;
464 
465   N_second = N_first->next;
466 /*
467   Possibilities:
468   there is one child node
469     Because we cleaned up before the only case where we allow one childnode is
470     within Textblocks and textBlocks within 'vertical' nodes.
471       basically one text node means: add it to the current block.
472   There are two childnodes
473     This can be two verticals, two horizontals or one horizontal and a text node.
474     verticals:
475       If the first is vertical, the second is as well.
476       verticals mean: create a new Block, add a column per vertical make the
477       vertical the block and recurse inside.
478       then make the second vertical the block and recurse inside
479       then finish the block (ie. create a new one)
480     horizontal and or Textblocks
481         if first is textnode
482           add first to block
483           if second is textnode
484             at to block
485           else
486             call again
487         else
488           begin new block
489             call again
490           begin new block
491           if second is text node
492             add to block
493           else
494             call again
495   there are more then two child nodes
496     this can be a number of Textblocks and horizontals
497     add the textNodes to the current Block
498     if a horizontal is encountered enter it and generate a new block afterwards
499   */
500   //fprintf(stderr,"**********************************************************************\n");
501   //xmlSaveFormatFileEnc("-", doc, "UTF-8", 1);
502   switch (xmlLsCountNode(N_parent)) {
503   case 1:
504     //fprintf(stderr,"case 1\n");
505     N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
506     xmlUnlinkNode(N_first);
507     xmlAddChild(N_line, N_first);
508     break;
509   case 2:
510     //fprintf(stderr,"case 2\n");
511     if (xmlStrcasecmp(N_first->name,BAD_CAST "vertical") == 0){
512       //store the column for the moment
513       N_tempCol = N_column;
514       /*If we have three columns they will turn up in the tree as:
515       <vertical>
516         <vertical/>
517         <vertical/>
518       </vertical>
519       <vertical/>
520       */
521       //if the parent is a vertical as well, we can skip the colset generation
522       //thing here we can also remove the just added column and block, because
523       //these are going to replace them
524       if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
525         //fprintf(stderr,"first time column\n");
526         N_tempColset = N_colset;
527         N_colset = xmlNewChild(N_column, NULL, BAD_CAST "colset", NULL);
528         N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
529         N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
530       }
531       else {
532         //fprintf(stderr,"second time column\n");
533         xmlUnlinkNode(N_column);
534         N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
535         N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
536       }
537       //fprintf(stderr,"Building first column...\n");
538       ATP_recursive(N_first);
539       N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
540       N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
541       //fprintf(stderr,"Building second column...\n");
542       ATP_recursive(N_second);
543       //make sure we end the column by continuing in the master column and
544       //setting the block and line to it
545       N_column = N_tempCol;
546       if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
547         if (N_tempColset != NULL)
548           N_colset = N_tempColset;
549         else
550           fprintf(stderr,"N_templColset should not! be empty (line 823)");//FIXME: add assert
551       }
552     }
553     else {
554       if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0) {
555         //fprintf(stderr,"add first as textblock\n");
556         N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
557         xmlUnlinkNode(N_first);
558         xmlAddChild(N_line, N_first);
559         if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
560           //fprintf(stderr,"add second as textblock\n");
561           //FIXME: this is not neat. We should ignore the cut ignoring when there are only two elements above
562           //line aggregation doesn't work anyway atm.
563           xmlUnlinkNode(N_second);
564           xmlAddChild(N_line, N_second);
565           //We have two textChunks that are going to be added to the line.
566           //the following statements make the line wrap around both textblocks
567           //if the firstX1 is smaller then the second X1 use the first, else use the second etc.
568         }
569         else {
570           //fprintf(stderr,"recursing into second\n");
571           ATP_recursive(N_second);
572         }
573       }
574       else {
575         N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
576         //fprintf(stderr,"recursing into first\n");
577         ATP_recursive(N_first);
578         N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
579         if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
580           //fprintf(stderr,"add second as textblock\n");
581           N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
582           xmlUnlinkNode(N_second);
583           xmlAddChild(N_line, N_second);
584         }
585         else {
586           //fprintf(stderr,"recursing into second\n");
587           ATP_recursive(N_second);
588         }
589       }
590     }
591     break;
592   default:
593     //double tX1=0, tX2=0, tY1=0, tY2=0;
594     //fprintf(stderr,"case default\n");
595     N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
596     while (N_first){
597       //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) < tX1 ? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) : tX1 = tX1;
598       //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) > tX2 ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) : tX2 = tX2;
599       //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) < tY1 ? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) : tY1 = tY1;
600       //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) > tY2 ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) : tY1 = tY2;
601       N_second = N_first->next;
602       if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0){
603         xmlUnlinkNode(N_first);
604         xmlAddChild(N_line, N_first);
605       }
606       else { //fprintf(stderr,"This shouldn't happen! (line 700)\n");
607       }
608       N_first = N_second;
609     }
610     break;
611   }
612 }
613 
614 /*The cleanup function. It started out as a simple function to remove empty nodes
615 so that I could call xmladdnewchildnode as often as I liked so that I wouldn't get seg-faults
616 It is now a bit more advanced, makes sure the tree is as it's supposed to be and adds information too*/
cleanUpNode(xmlNodePtr N_parent,bool aggregateInfo)617 void ABWOutputDev::cleanUpNode(xmlNodePtr N_parent, bool aggregateInfo){
618   double tX1=-1, tX2=-1, tY1=-1, tY2=-1;
619   xmlNodePtr N_cur, N_next;
620   N_cur = N_parent->children;
621   char buf[20];
622   int prevStyle = -1;
623   xmlChar *val;
624   int styleLength = xmlLsCountNode(N_styleset)+1;
625   float stylePos;
626   int *styles = new int[styleLength];
627   for (int i=1; i< styleLength; i++) { styles[i] = 0;}
628   /*
629   ignore two horizontal nodes with textBlocks right underneath them. They
630   signal the end of a chunk, and the horizontal seperation needs to be
631   preserved, because it means they are different lines. The second horizontal
632   therefore needs to be kept.
633   */
634   if ((xmlLsCountNode(N_parent) == 2)
635       &&
636      xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0
637       &&
638      N_cur
639       &&
640      N_cur->next
641       &&
642      xmlStrcasecmp(N_cur->name,BAD_CAST "horizontal") == 0 && xmlStrcasecmp(N_cur->next->name,BAD_CAST "horizontal") == 0
643       &&
644      xmlLsCountNode(N_cur) == 1 && xmlLsCountNode(N_cur->next) == 1
645       &&
646      xmlStrcasecmp(N_cur->children->name,BAD_CAST "Textblock") == 0 && xmlStrcasecmp(N_cur->next->children->name,BAD_CAST "Textblock") == 0
647      ) {
648     xmlAddPrevSibling(N_cur->next,N_cur->children);
649     xmlUnlinkNode(N_cur);
650   }
651   /*
652   This removes columns if one of the parts is actually a single letter.
653   I found out I liked the columns better, so I have the code commented out.
654   */
655 /*  else if ((xmlLsCountNode(N_parent) == 2)
656              &&
657             N_cur
658              &&
659             N_cur->next
660              &&
661             xmlStrcasecmp(N_cur->name,BAD_CAST "vertical") == 0
662              &&
663             xmlStrcasecmp(N_cur->next->name,BAD_CAST "vertical") == 0
664              &&
665             (N_cur->children)
666              &&
667             (N_cur->children->children)
668              &&
669             (N_cur->children->children->children)
670              &&
671             xmlStrlen(N_cur->children->children->children->content) == 1) {
672     N_next = N_cur->next;
673     xmlAddChild(N_parent, N_next->children);
674     xmlAddPrevSibling(N_next->children->children, N_cur->children);
675     xmlUnlinkNode(N_cur);
676     xmlUnlinkNode(N_next);
677   } */else {
678     while (N_cur){
679       N_next = N_cur->next;
680       cleanUpNode(N_cur, aggregateInfo);
681       if (xmlLsCountNode(N_cur) == 0 && (xmlStrcasecmp(N_cur->name,BAD_CAST "cbr") != 0) && (xmlStrcasecmp(N_cur->name,BAD_CAST "s") != 0))
682         xmlUnlinkNode(N_cur);
683       //If the node is still around
684       N_cur = N_next;
685     }
686   }
687   //If a countainer element has only one child, it can be removed except for vertical
688   //cuts with only one textElement;
689   //the main reason for this code is to remove the crumbs after cleaning up in the loop above
690   if ((xmlLsCountNode(N_parent) == 1) && ((xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0) || ((xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") == 0) && (xmlStrcasecmp(N_parent->children->name,BAD_CAST "Textblock") != 0)))){
691     N_cur = N_parent->children;
692     xmlAddPrevSibling(N_parent,N_cur);
693     xmlUnlinkNode(N_parent);
694   }
695   //We cannot remove the page element so if it has only one childnode, we remove that childnode instead
696   if ((xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0) && (xmlLsCountNode(N_parent) == 1)) {
697     N_cur = N_parent->children->children;
698     while (N_cur){
699       N_next = N_cur->next;
700       xmlUnlinkNode(N_cur);
701       xmlAddChild(N_parent, N_cur);
702       N_cur = N_next;
703     }
704     xmlUnlinkNode(N_parent->children);
705   }
706   //Ok, so by this time the N_parent and his children are guaranteed to be clean
707   //this for loop gets information from the 'word' elements and propagates it up
708   //the tree.
709   if (aggregateInfo && xmlStrcasecmp(N_parent->name,BAD_CAST "word") != 0) {
710     for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
711       val = xmlGetProp(N_cur,BAD_CAST "style");
712       stylePos = xmlXPathCastStringToNumber(val);
713       //fprintf(stderr,"1: %f, %d\n",stylePos,int(stylePos));
714       styles[int(stylePos)]=styles[int(stylePos)]+1;
715       //fprintf(stderr,"2: styles[%d] = %d\n",int(stylePos),styles[int(stylePos)]);
716       (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) < tX1 || tX1 == -1)? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) : tX1 = tX1;
717       (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) > tX2)             ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) : tX2 = tX2;
718       (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) < tY1 || tY1 == -1)? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) : tY1 = tY1;
719       (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) > tY2)             ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) : tY2 = tY2;
720     }
721     sprintf(buf, "%f", tX1);     xmlSetProp(N_parent, BAD_CAST "X1", BAD_CAST buf);
722     sprintf(buf, "%f", tX2);     xmlSetProp(N_parent, BAD_CAST "X2", BAD_CAST buf);
723     sprintf(buf, "%f", tY1);     xmlSetProp(N_parent, BAD_CAST "Y1", BAD_CAST buf);
724     sprintf(buf, "%f", tY2);     xmlSetProp(N_parent, BAD_CAST "Y2", BAD_CAST buf);
725     sprintf(buf, "%f", tX2-tX1); xmlSetProp(N_parent, BAD_CAST "width", BAD_CAST buf);
726     sprintf(buf, "%f", tY2-tY1); xmlSetProp(N_parent, BAD_CAST "height", BAD_CAST buf);
727     prevStyle = 0;
728     styles[0] = -1;
729     for (int i=1; i< styleLength; i++) { if (styles[i] > styles[prevStyle]) prevStyle = i; }
730     //fprintf(stderr,"%d\n", prevStyle);
731     if (prevStyle > 0){
732       sprintf(buf, "%d", prevStyle);     xmlSetProp(N_parent, BAD_CAST "style", BAD_CAST buf);
733     }
734   }
735   if (N_parent->children && xmlStrcasecmp(N_parent->children->name,BAD_CAST "line") == 0 && xmlGetProp(N_parent->children,BAD_CAST "alignment") != NULL)
736     xmlSetProp(N_parent, BAD_CAST "alignment", xmlGetProp(N_parent->children,BAD_CAST "alignment"));
737 
738    delete[] styles;
739 }
740 
generateParagraphs()741 void ABWOutputDev::generateParagraphs() {
742   xmlNodePtr N_cur, N_parent, N_p, N_line, N_next;
743   int lvl;
744   //basically I first detect the text-alignment within blocks.
745   //ASSUMPTION: my block seperation thing is good enough so I don't need to
746   //worry about two alignments in one paragraph
747 
748   X1 = 0;
749   X2 = pdfdoc->getPageCropWidth(G_pageNum);
750   Y1 = 0;
751   Y2 = pdfdoc->getPageCropHeight(G_pageNum);
752   addAlignment(N_page);
753 
754   //then it's a switch per alignement
755   N_cur = N_page->children;
756   N_parent = N_page;
757   lvl = 1;
758   while (N_cur) {
759     if (xmlStrcasecmp(N_cur->name,BAD_CAST "chunk") == 0){
760       N_p = xmlNewNode(NULL, BAD_CAST "chunk");
761       xmlAddPrevSibling(N_cur,N_p);
762       //N_p = xmlNewChild(N_parent, NULL, BAD_CAST "chunk", NULL);
763       //A new paragraph is created when:
764       switch (int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "alignment")))){
765       //left
766       case 1: //the distance between the texblock X2 and the last word X2 is more than
767          //the following first word width.
768          N_line = N_cur->children;
769          while (N_line){
770            N_next = N_line->next;
771            xmlUnlinkNode(N_line);
772            xmlAddChild(N_p,N_line);
773            xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
774            if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
775              if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
776                N_p = xmlNewNode(NULL, BAD_CAST "chunk");
777                xmlAddPrevSibling(N_cur,N_p);
778              }
779            }
780            N_line = N_next;
781          }
782          break;
783       //right
784       case 2: //the same but now with X1 and first word and following last word
785          N_line = N_cur->children;
786          while (N_line){
787            N_next = N_line->next;
788            xmlUnlinkNode(N_line);
789            xmlAddChild(N_p,N_line);
790            xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
791            if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
792              //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
793              if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
794                N_p = xmlNewNode(NULL, BAD_CAST "chunk");
795                xmlAddPrevSibling(N_cur,N_p);
796              }
797            }
798            N_line = N_next;
799          }
800          break;
801       //centered
802       case 3: //the combined left and right space is more than the following first word
803          N_line = N_cur->children;
804          while (N_line){
805            N_next = N_line->next;
806            xmlUnlinkNode(N_line);
807            xmlAddChild(N_p,N_line);
808            xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
809            if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
810              //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
811              if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
812                N_p = xmlNewNode(NULL, BAD_CAST "chunk");
813                xmlAddPrevSibling(N_cur,N_p);
814              }
815            }
816            N_line = N_next;
817          }
818          break;
819       //justified
820       case 4:
821          //we break on all alignment=1 lines. A line with alignment=1 that is the first of a block will
822          //also initiate a paragraph break before.
823          N_line = N_cur->children;
824          if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
825            N_p = xmlNewNode(NULL, BAD_CAST "chunk");
826            xmlAddPrevSibling(N_cur,N_p);
827          }
828          while (N_line){
829            N_next = N_line->next;
830            xmlUnlinkNode(N_line);
831            xmlAddChild(N_p,N_line);
832            if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
833              N_p = xmlNewNode(NULL, BAD_CAST "chunk");
834              xmlAddPrevSibling(N_cur,N_p);
835            }
836            xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
837            N_line = N_next;
838          }
839          break;
840       }
841     }
842     else if (xmlStrcasecmp(N_cur->name,BAD_CAST "colset") == 0 || xmlStrcasecmp(N_cur->name,BAD_CAST "column") == 0){
843       N_parent = N_cur;
844       N_cur = N_cur->children;
845       lvl++;
846       N_p = xmlNewNode(NULL, BAD_CAST "chunk");
847       xmlAddPrevSibling(N_cur,N_p);
848       continue;
849     }
850     if (N_cur->next)
851       N_cur = N_cur->next;
852     else while (lvl > 0){
853       N_cur = N_parent;
854       N_parent = N_cur->parent;
855       lvl--;
856       if (N_cur->next){
857         N_cur = N_cur->next;
858         break;
859       }
860     }
861     if (lvl==0)
862       N_cur = NULL;
863   }
864 }
865 
866 //function that adds an 'alignment=' property to the <chunk>s
addAlignment(xmlNodePtr N_parent)867 void ABWOutputDev::addAlignment(xmlNodePtr N_parent) {
868   xmlNodePtr N_chunk, N_line;
869   double tX1, tX2;
870   bool leftMatch, rightMatch, centerMatch;
871   int leftCnt = 0, rightCnt = 0, cntrCnt = 0, justCnt = 0;
872   //fprintf(stderr,"Entering addAlignment\n");
873   for (N_chunk = N_parent->children; N_chunk; N_chunk = N_chunk->next) {
874     if (xmlStrcasecmp(N_chunk->name,BAD_CAST "chunk") == 0){
875       X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
876       X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
877       //fprintf(stderr,"Found chunk\n");
878       //if the chunk contains only one line, we don't need to loop through it.
879       if (xmlLsCountNode(N_chunk) == 1){
880         //fprintf(stderr,"Processing line\n");
881         //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")));
882         //fprintf(stderr,"%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
883         //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
884         // a one line chunk, is either centered or left or right-aligned.
885         if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))) > 1) {
886           xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
887           xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "2");
888           //fprintf(stderr,"alignment = right\n");
889         }
890         else {
891         if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")))< -1) {
892           xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
893           xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "1");
894           //fprintf(stderr,"alignment = left\n");
895         }
896         else {
897           xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
898           xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "3");
899           //fprintf(stderr,"alignment = center\n");
900         }
901         }
902       }
903       else {
904       leftCnt = 0;
905       rightCnt = 0;
906       cntrCnt = 0;
907       justCnt = 0;
908       for (N_line = N_chunk->children; N_line; N_line = N_line->next) {
909         //fprintf(stderr,"Processing line\n");
910         /*
911         |X1 - cX1| == 1
912         |X2 - cX2| == 1
913         |(cX1-X1)-(X2-cX2)| == 1
914         ok, each line can be just as wide as the current set,
915         it can be smaller and moved to the right
916         it can be smaller and moved to the left.
917         it can
918         */
919         //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")));
920         //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))));
921         leftMatch =  fabs(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1) < 2;
922         rightMatch =  fabs(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))) < 2;
923         centerMatch =  fabs((xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")))) < 2;
924         if (leftMatch && rightMatch) {
925           xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
926           justCnt++;
927         }
928         else if (centerMatch) {
929           xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
930           cntrCnt++;
931         }
932         else if (rightMatch) {
933           xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
934           rightCnt++;
935         }
936         else {
937           xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
938           leftCnt++;
939         }
940       }
941       //there is almost always one justified line in a centered text
942       //and most justified blocks have at least one left aligned line
943       //fprintf(stderr,"1:%d ,2:%d ,3:%d ,4:%d\n",leftCnt,justCnt,cntrCnt,rightCnt);
944       if ((leftCnt-1 >= justCnt) && (leftCnt >= rightCnt) && (leftCnt >= cntrCnt))
945         xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
946       else if ((justCnt >= leftCnt-1) && (justCnt >= rightCnt) && (justCnt >= cntrCnt))
947         xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "4");
948       else if ((cntrCnt >= justCnt-1) && (cntrCnt >= rightCnt) && (cntrCnt >= leftCnt))
949         xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
950       else
951         xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
952       }
953     }
954     else {
955       if (xmlStrcasecmp(N_chunk->name,BAD_CAST "colset") == 0){
956         //fprintf(stderr,"Found a colset\n");
957         addAlignment(N_chunk);
958       }
959       else {
960         if (xmlStrcasecmp(N_chunk->name,BAD_CAST "column") == 0){
961           //fprintf(stderr,"Found a column\n");
962           tX1 = X1;
963           tX2 = X2;
964           X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
965           X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
966           addAlignment(N_chunk);
967           X1 = tX1;
968           X2 = tX2;
969         }
970         else { //fprintf(stderr,"Found something else\n");
971 	}
972       }
973     }
974   }
975 //parse all blocks, and all lines within all blocks
976 //do a set of checks and tick a flag if the check fails
977 //check for line X1 is textBlock X1
978 //check for line X2 is textblock X2
979 //check if line is centered in textBock (LX1 != TX1 && LX2 != TX2 && LX1-TX1 == TX2=LX2)
980 //if the LX1 != TX1 then how much is the difference?
981 //a line isn't left aligned if all lines have a different X1 <= not so strong assumption.
982 
983 //justified if both are straight except for a couple of (same factor sized) indents at the left
984 //else centered if above calculation is correct
985 //else left aligned if left side is more straight than right (more lines in the same X1 or common factor
986 //else right
987 }
988 
setPDFDoc(PDFDoc * priv_pdfdoc)989 void ABWOutputDev::setPDFDoc(PDFDoc *priv_pdfdoc) {
990   pdfdoc = priv_pdfdoc;
991 }
992 
createABW()993 void ABWOutputDev::createABW() {
994   //*************************************************************
995   //change styles to abiword format
996   xmlNodePtr N_cur, N_next;
997   xmlAttrPtr N_prop;
998   char buf[500];
999   for (N_cur = N_styleset->children; N_cur; N_cur = N_cur->next){
1000     sprintf(buf,"margin-top:0pt; color:000000; margin-left:0pt; text-position:normal; widows:2; text-indent:0in; font-variant:normal; margin-right:0pt; lang:nl-NL; line-height:1.0; font-size:%dpt; text-decoration:none; margin-bottom:0pt; bgcolor:transparent; text-align:left; font-stretch:normal;",int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size"))));
1001     strncat(buf,"font-family:",12);
1002     strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "font"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "font")));
1003     strncat(buf,";",1);
1004     strncat(buf,"font-weight:",12);
1005     strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "bold"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "bold")));
1006     strncat(buf,"font-style:",12);
1007     strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "italic"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "italic")));
1008     xmlSetProp(N_cur, BAD_CAST "props", BAD_CAST buf);
1009     N_prop = xmlHasProp(N_cur, BAD_CAST "id");
1010     if (N_prop != NULL) xmlRemoveProp(N_prop);
1011     N_prop = xmlHasProp(N_cur, BAD_CAST "size");
1012     if (N_prop != NULL) xmlRemoveProp(N_prop);
1013     N_prop = xmlHasProp(N_cur, BAD_CAST "bold");
1014     if (N_prop != NULL) xmlRemoveProp(N_prop);
1015     N_prop = xmlHasProp(N_cur, BAD_CAST "italic");
1016     if (N_prop != NULL) xmlRemoveProp(N_prop);
1017     N_prop = xmlHasProp(N_cur, BAD_CAST "font");
1018     if (N_prop != NULL) xmlRemoveProp(N_prop);
1019   }
1020   //*************************************************************
1021   //Change the rest of the document
1022   //each child of N_content is a page
1023   N_cur = N_content->children;
1024   while (N_cur){
1025     //we creat a section node and attach it to the root, it will com after all
1026     //the page nodes. Then we transform the page, and finally remove it
1027     N_next = N_cur->next;
1028     //fprintf(stderr,"***Transforming page\n");
1029     N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1030     transformPage(N_cur);
1031     xmlUnlinkNode(N_cur);
1032     //fprintf(stderr,"***Finished transforming page\n");
1033     N_cur = N_next;
1034   }
1035   cleanUpNode(N_root, false);
1036 }
1037 
transformPage(xmlNodePtr N_parent)1038 void ABWOutputDev::transformPage(xmlNodePtr N_parent){
1039   char buf[60];
1040   xmlNodePtr N_cur, N_curLine, N_curText, N_curWord, text, space;
1041   //translate the nodes into abiword nodes
1042   if (xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0){
1043     for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1044       //fprintf(stderr,"**pass a page child\n");
1045       transformPage(N_cur);
1046     }
1047   }
1048   if (xmlStrcasecmp(N_parent->name,BAD_CAST "chunk") == 0){
1049     //fprintf(stderr,"Found a chunk\n");
1050     //I start a <p> on each chunk and add all word containment
1051     N_text = xmlNewChild(N_Block, NULL, BAD_CAST "p", NULL);
1052     if (int(xmlXPathCastStringToNumber(xmlGetProp(N_parent,BAD_CAST "style"))) > 0){
1053       xmlNewProp(N_text, BAD_CAST "style", xmlGetProp(N_parent,BAD_CAST "style"));
1054     }
1055     switch (int(xmlXPathCastStringToNumber(xmlGetProp(N_parent,BAD_CAST "alignment")))){
1056     case 1: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:left");
1057            break;
1058     case 2: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:right");
1059            break;
1060     case 3: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:center");
1061            break;
1062     case 4: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:justify");
1063            break;
1064     }
1065     for (N_curLine = N_parent->children; N_curLine; N_curLine = N_curLine->next){
1066       //fprintf(stderr,"A line\n");
1067       for (N_curText = N_curLine->children; N_curText; N_curText = N_curText->next){
1068         //fprintf(stderr,"a textNode\n");
1069         for (N_curWord = N_curText->children; N_curWord; N_curWord = N_curWord->next){
1070           //fprintf(stderr,"a word\n");
1071           text = N_curWord->children;
1072           xmlUnlinkNode(text);
1073           xmlAddChild(N_text,text);
1074           space = xmlNewText(BAD_CAST " ");
1075           xmlAddChild(N_text,space);
1076         }
1077       }
1078     }
1079   }
1080   if (xmlStrcasecmp(N_parent->name,BAD_CAST "column") == 0){
1081     //fprintf(stderr,"Found a column\n");
1082     for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1083       transformPage(N_cur);
1084     }
1085     xmlNewChild(N_text, NULL, BAD_CAST "cbr", NULL);
1086   }
1087   if (xmlStrcasecmp(N_parent->name,BAD_CAST "colset") == 0){
1088     //fprintf(stderr,"Found a colset\n");
1089     //create new section columns: count childNodes of N_cur
1090     //recurse through chunks and create textNodes
1091     N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1092     sprintf(buf,"columns:%d",xmlLsCountNode(N_parent));
1093     xmlNewProp(N_Block, BAD_CAST "props", BAD_CAST buf);
1094     for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
1095       transformPage(N_cur);
1096     }
1097     N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1098   }
1099   //fprintf(stderr,"at the end\n");
1100 }
1101 
1102 //Count nodes, copied from debugxml.c from libxml
1103 // libxml copyright file below
1104 /*
1105 Except where otherwise noted in the source code (e.g. the files hash.c,
1106 list.c and the trio files, which are covered by a similar licence but
1107 with different Copyright notices) all the files are:
1108 
1109  Copyright (C) 1998-2003 Daniel Veillard.  All Rights Reserved.
1110 
1111 Permission is hereby granted, free of charge, to any person obtaining a copy
1112 of this software and associated documentation files (the "Software"), to deal
1113 in the Software without restriction, including without limitation the rights
1114 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1115 copies of the Software, and to permit persons to whom the Software is fur-
1116 nished to do so, subject to the following conditions:
1117 
1118 The above copyright notice and this permission notice shall be included in
1119 all copies or substantial portions of the Software.
1120 
1121 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1122 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
1123 NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
1124 DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
1125 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CON-
1126 NECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1127 
1128 Except as contained in this notice, the name of Daniel Veillard shall not
1129 be used in advertising or otherwise to promote the sale, use or other deal-
1130 ings in this Software without prior written authorization from him.
1131 */
xmlLsCountNode(xmlNodePtr node)1132 int ABWOutputDev::xmlLsCountNode(xmlNodePtr node) {
1133   int ret = 0;
1134   xmlNodePtr list = NULL;
1135 
1136   if (node == NULL)
1137     return(0);
1138 
1139   switch (node->type) {
1140     case XML_ELEMENT_NODE:
1141       list = node->children;
1142       break;
1143     case XML_DOCUMENT_NODE:
1144     case XML_HTML_DOCUMENT_NODE:
1145 #ifdef LIBXML_DOCB_ENABLED
1146     case XML_DOCB_DOCUMENT_NODE:
1147 #endif
1148       list = ((xmlDocPtr) node)->children;
1149       break;
1150     case XML_ATTRIBUTE_NODE:
1151       list = ((xmlAttrPtr) node)->children;
1152       break;
1153     case XML_TEXT_NODE:
1154     case XML_CDATA_SECTION_NODE:
1155     case XML_PI_NODE:
1156     case XML_COMMENT_NODE:
1157       if (node->content != NULL) {
1158         ret = xmlStrlen(node->content);
1159       }
1160       break;
1161     case XML_ENTITY_REF_NODE:
1162     case XML_DOCUMENT_TYPE_NODE:
1163     case XML_ENTITY_NODE:
1164     case XML_DOCUMENT_FRAG_NODE:
1165     case XML_NOTATION_NODE:
1166     case XML_DTD_NODE:
1167     case XML_ELEMENT_DECL:
1168     case XML_ATTRIBUTE_DECL:
1169     case XML_ENTITY_DECL:
1170     case XML_NAMESPACE_DECL:
1171     case XML_XINCLUDE_START:
1172     case XML_XINCLUDE_END:
1173       ret = 1;
1174       break;
1175   }
1176   for (;list != NULL;ret++)
1177     list = list->next;
1178   return(ret);
1179 }
1180