1 //C-  -*- C++ -*-
2 //C- -------------------------------------------------------------------
3 //C- DjVuLibre-3.5
4 //C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5 //C- Copyright (c) 2001  AT&T
6 //C-
7 //C- This software is subject to, and may be distributed under, the
8 //C- GNU General Public License, either Version 2 of the license,
9 //C- or (at your option) any later version. The license should have
10 //C- accompanied the software or you may obtain a copy of the license
11 //C- from the Free Software Foundation at http://www.fsf.org .
12 //C-
13 //C- This program is distributed in the hope that it will be useful,
14 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 //C- GNU General Public License for more details.
17 //C-
18 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19 //C- Lizardtech Software.  Lizardtech Software has authorized us to
20 //C- replace the original DjVu(r) Reference Library notice by the following
21 //C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22 //C-
23 //C-  ------------------------------------------------------------------
24 //C- | DjVu (r) Reference Library (v. 3.5)
25 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
27 //C- | 6,058,214 and patents pending.
28 //C- |
29 //C- | This software is subject to, and may be distributed under, the
30 //C- | GNU General Public License, either Version 2 of the license,
31 //C- | or (at your option) any later version. The license should have
32 //C- | accompanied the software or you may obtain a copy of the license
33 //C- | from the Free Software Foundation at http://www.fsf.org .
34 //C- |
35 //C- | The computer code originally released by LizardTech under this
36 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
37 //C- | ORIGINAL CODE."  Subject to any third party intellectual property
38 //C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39 //C- | non-exclusive license to make, use, sell, or otherwise dispose of
40 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42 //C- | General Public License.   This grant only confers the right to
43 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44 //C- | the extent such infringement is reasonably necessary to enable
45 //C- | recipient to make, have made, practice, sell, or otherwise dispose
46 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47 //C- | any greater extent that may be necessary to utilize further
48 //C- | modifications or combinations.
49 //C- |
50 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54 //C- +------------------------------------------------------------------
55 
56 #ifdef HAVE_CONFIG_H
57 # include "config.h"
58 #endif
59 #if NEED_GNUG_PRAGMAS
60 # pragma implementation
61 #endif
62 
63 // From: Leon Bottou, 1/31/2002
64 // This is purely Lizardtech stuff.
65 
66 #include "XMLParser.h"
67 #include "XMLTags.h"
68 #include "ByteStream.h"
69 #include "GOS.h"
70 #include "DjVuDocument.h"
71 #include "DjVuText.h"
72 #include "DjVuAnno.h"
73 #include "DjVuFile.h"
74 #include "DjVuImage.h"
75 #include "debug.h"
76 #include <stdio.h>
77 #include <ctype.h>
78 #include <stddef.h>
79 #include <stdlib.h>
80 
81 
82 #ifdef HAVE_NAMESPACES
83 namespace DJVU {
84 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
85 }
86 #endif
87 #endif
88 
// MIME type an OBJECT's "type" attribute is compared against.
static const char mimetype[]      = "image/x.djvu";

// Structural XML element names.
static const char bodytag[]       = "BODY";
static const char areatag[]       = "AREA";
static const char maptag[]        = "MAP";
static const char objecttag[]     = "OBJECT";
static const char paramtag[]      = "PARAM";

// Text-zone element names.
static const char charactertag[]  = "CHARACTER";
static const char wordtag[]       = "WORD";
static const char linetag[]       = "LINE";
static const char paragraphtag[]  = "PARAGRAPH";
static const char regiontag[]     = "REGION";
static const char pagecolumntag[] = "PAGECOLUMN";

// Remaining element names.
static const char hiddentexttag[] = "HIDDENTEXT";
static const char metadatatag[]   = "METADATA";
103 
104 class lt_XMLParser::Impl : public lt_XMLParser
105 {
106 public:
107   Impl(void);
108   virtual ~Impl();
109   /// Parse the specified bytestream.
110   virtual void parse(const GP<ByteStream> &bs, GURL *pdjvufile);
111   /// Parse the specified tags - this one does all the work
112   virtual void parse(const lt_XMLTags &tags, GURL *pdjvufile);
113   /// write to disk.
114   virtual void save(void);
115   /// erase.
116   virtual void empty(void);
117 protected:
118   GP<DjVuFile> get_file(const GURL &url,GUTF8String page);
119 
120   void parse_anno(const int width, const int height,
121     const lt_XMLTags &GObject,
122     GMap<GUTF8String,GP<lt_XMLTags> > &Maps, DjVuFile &dfile);
123 
124   void parse_text(const int width, const int height,
125     const lt_XMLTags &GObject, DjVuFile &dfile);
126 
127   void parse_meta(const lt_XMLTags &GObject, DjVuFile &dfile);
128 
129   void ChangeAnno( const int width, const int height,
130     DjVuFile &dfile, const lt_XMLTags &map);
131 
132   void ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma);
133 
134   void ChangeText( const int width, const int height,
135     DjVuFile &dfile, const lt_XMLTags &map);
136 
137   void ChangeMeta( DjVuFile &dfile, const lt_XMLTags &map);
138 
139   void ChangeTextOCR( const GUTF8String &value,
140     const int width, const int height,
141     const GP<DjVuFile> &dfile);
142 
143   // we may want to make these list of modified file static so
144   // they only needed to be loaded and saved once.
145 
146   GMap<GUTF8String,GP<DjVuFile> > m_files;
147   GMap<GUTF8String,GP<DjVuDocument> > m_docs;
148 
149   GURL m_codebase;
150   GCriticalSection xmlparser_lock;
151 };
152 
153 static GP<ByteStream>
154 OCRcallback(
155   void * const xarg,
156   lt_XMLParser::mapOCRcallback * const xcallback,
157   const GUTF8String &value=GUTF8String(),
158   const GP<DjVuImage> &image=0 );
159 
160 static inline GP<ByteStream>
OCRcallback(const GUTF8String & value,const GP<DjVuImage> & image)161 OCRcallback(const GUTF8String &value, const GP<DjVuImage> &image)
162 {
163   return OCRcallback(0,0,value,image);
164 }
165 
lt_XMLParser()166 lt_XMLParser::lt_XMLParser() {}
~lt_XMLParser()167 lt_XMLParser::~lt_XMLParser() {}
Impl()168 lt_XMLParser::Impl::Impl() {}
~Impl()169 lt_XMLParser::Impl::~Impl() {}
170 
171 GP<lt_XMLParser>
create(void)172 lt_XMLParser::create(void)
173 {
174   return new lt_XMLParser::Impl;
175 }
176 
177 // helper function for args
178 static void
intList(GUTF8String coords,GList<int> & retval)179 intList(GUTF8String coords, GList<int> &retval)
180 {
181   int pos=0;
182   while(coords.length())
183   {
184     int epos;
185     unsigned long i=coords.toLong(pos,epos,10);
186     if(epos>=0)
187     {
188       retval.append(i);
189       const int n=coords.nextNonSpace(epos);
190       if(coords[n] != ',')
191         break;
192       pos=n+1;
193     }
194   }
195 }
196 
197 void
empty(void)198 lt_XMLParser::Impl::empty(void)
199 {
200   GCriticalSectionLock lock(&xmlparser_lock);
201   m_files.empty();
202   m_docs.empty();
203 }
204 
205 void
save(void)206 lt_XMLParser::Impl::save(void)
207 {
208   GCriticalSectionLock lock(&xmlparser_lock);
209   for(GPosition pos=m_docs;pos;++pos)
210   {
211     const GP<DjVuDocument> doc(m_docs[pos]);
212     const GURL url=doc->get_init_url();
213 
214     DEBUG_MSG("Saving "<<(const char *)url<<" with new text and annotations\n");
215     const bool bundle=doc->is_bundled()||(doc->get_doc_type()==DjVuDocument::SINGLE_PAGE);
216     doc->save_as(url,bundle);
217   }
218   empty();
219 }
220 
221 void
parse(const GP<ByteStream> & bs,GURL * pdjvufile)222 lt_XMLParser::Impl::parse(const GP<ByteStream> &bs, GURL *pdjvufile)
223 {
224   const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
225   parse(*tags, pdjvufile);
226 }
227 
228 static const GMap<GUTF8String,GMapArea::BorderType> &
BorderTypeMap(void)229 BorderTypeMap(void)
230 {
231   static GMap<GUTF8String,GMapArea::BorderType> typeMap;
232   if (! typeMap.size())
233     {
234       typeMap["none"]=GMapArea::NO_BORDER;
235       typeMap["xor"]=GMapArea::XOR_BORDER;
236       typeMap["solid"]=GMapArea::SOLID_BORDER;
237       typeMap["default"]=GMapArea::SOLID_BORDER;
238       typeMap["shadowout"]=GMapArea::SHADOW_OUT_BORDER;
239       typeMap["shadowin"]=GMapArea::SHADOW_IN_BORDER;
240       typeMap["etchedin"]=GMapArea::SHADOW_EIN_BORDER;
241       typeMap["etchedout"]=GMapArea::SHADOW_EOUT_BORDER;
242     }
243   return typeMap;
244 }
245 
246 static unsigned long
convertToColor(const GUTF8String & s)247 convertToColor(const GUTF8String &s)
248 {
249   unsigned long retval=0;
250   if(s.length())
251   {
252     int endpos = -1;
253     if(s[0] == '#')
254     {
255       retval=s.substr(1,-1).toULong(0,endpos,16);
256     }
257     if(endpos < 0)
258     {
259       G_THROW( (ERR_MSG("XMLAnno.bad_color") "\t")+s );
260     }
261   }
262   return retval;
263 }
264 
265 void
ChangeInfo(DjVuFile & dfile,const int dpi,const double gamma)266 lt_XMLParser::Impl::ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma)
267 {
268   GP<DjVuInfo> info;
269   if(dpi >= 5 && dpi <= 4800)
270   {
271     dfile.resume_decode(true);
272     if(dfile.info && (dpi != dfile.info->dpi) )
273     {
274       info=new DjVuInfo(*dfile.info);
275       info->dpi=dpi;
276     }
277   }
278   if(gamma >= 0.1 && gamma <= 5.0)
279   {
280     dfile.resume_decode(true);
281     if(dfile.info && (gamma != dfile.info->gamma) )
282     {
283       if(!info)
284         info=new DjVuInfo(*dfile.info);
285       info->gamma=gamma;
286     }
287   }
288   if(info)
289   {
290     dfile.change_info(info);
291   }
292 }
293 
// Rebuilds the annotation data of `dfile` from the AREA children of `map`.
//
// Steps, as implemented below:
//   1. If the file has annotations, decode the merged annotations and
//      (when page info is available) drop their existing map areas.
//   2. If page info is available and `map` contains AREA tags, convert
//      every AREA into a GMapArea and append it to the map areas.
//      Coordinates are scaled by info->width/width and info->height/height
//      (only when width/height are non-zero and differ from the page
//      size) and the y axis is flipped (y = h-1 - scaled y).
//   3. Mark the file as modified and re-encode the annotations into
//      dfile.anno.
//
// Parameters:
//   width, height  size the AREA coordinates refer to; 0 means no scaling
//   dfile          file whose annotations are replaced
//   map            tag holding the AREA children
//
// Throws (G_THROW): XMLAnno.bad_rect / bad_circle / bad_oval when fewer
// than four coordinate values are available, XMLAnno.unknown_shape,
// XMLAnno.unknown_border, and XMLAnno.bad_color (from convertToColor).
294 void
ChangeAnno(const int width,const int height,DjVuFile & dfile,const lt_XMLTags & map)295 lt_XMLParser::Impl::ChangeAnno(
296   const int width, const int height,
297   DjVuFile &dfile,
298   const lt_XMLTags &map )
299 {
300   dfile.resume_decode(true);
301   const GP<DjVuInfo> info(dfile.info);
302   const GP<DjVuAnno> ganno(DjVuAnno::create());
303   DjVuAnno &anno=*ganno;
304   GPosition map_pos;
305   map_pos=map.contains(areatag);
306   if(dfile.contains_anno())
307   {
308     GP<ByteStream> annobs=dfile.get_merged_anno();
309     if(annobs)
310     {
311       anno.decode(annobs);
312       if(anno.ant && info)
313       {
        // Existing areas are discarded; other decoded annotation data is kept.
314         anno.ant->map_areas.empty();
315       }
316     }
317 //    dfile.remove_anno();
318   }
319   if(info && map_pos)
320   {
321     const int h=info->height;
322     const int w=info->width;
    // Scale factors from the XML coordinate space to page pixels.
323     double ws=1.0;
324     double hs=1.0;
325     if(width && width != w)
326     {
327       ws=((double)w)/((double)width);
328     }
329     if(height && height != h)
330     {
331       hs=((double)h)/((double)height);
332     }
333     if(!anno.ant)
334     {
335       anno.ant=DjVuANT::create();
336     }
337     GPList<GMapArea> &map_areas=anno.ant->map_areas;
338     map_areas.empty();
339     GPList<lt_XMLTags> gareas=map[map_pos];
340     for(GPosition pos=gareas;pos;++pos)
341     {
342       if(gareas[pos])
343       {
344         lt_XMLTags &areas=*(gareas[pos]);
345         GMap<GUTF8String,GUTF8String> args(areas.get_args());
346         GList<int> coords;
347         // ******************************************************
348         // Parse the coords attribute:  first read the raw data into
349         // a list, then scale the x, y data into another list.  For
350         // circles, you also get a radius element (it looks like an x
351         // with no matching y).
352         // ******************************************************
353         {
354           GPosition coords_pos=args.contains("coords");
355           if(coords_pos)
356           {
357             GList<int> raw_coords;
358             intList(args[coords_pos],raw_coords);
            // Values are consumed in (x,y) pairs; the loop body advances
            // raw_pos once itself and the for statement advances it again.
359             for(GPosition raw_pos=raw_coords;raw_pos;++raw_pos)
360             {
361               const int r=raw_coords[raw_pos];
362               const int x=(int)(ws*(double)r+0.5);
363               coords.append(x);
364               int y=h-1;
365               if(! ++raw_pos)
366               {
                // Odd value count: the last value has no partner, so it is
                // also used (scaled by hs and flipped) as the y entry.
367                 y-=(int)(hs*(double)r+0.5);
368               }else
369               {
370                 y-=(int)(hs*(double)raw_coords[raw_pos]+0.5);
371               }
372               coords.append(y);
373 //            DjVuPrintMessage("Coords (%d,%d)\n",x,y);
374             }
375           }
376         }
377         GUTF8String shape;
378         {
379           GPosition shape_pos=args.contains("shape");
380           if(shape_pos)
381           {
382             shape=args[shape_pos];
383           }
384         }
385         GP<GMapArea> a;
386         if(shape == "default")
387         {
          // "default" ignores coords and covers the whole page.
388           GRect rect(0,0,w,h);
389           a=GMapRect::create(rect);
390         }else if(!shape.length() || shape == "rect")
391         {
          // A missing shape attribute is treated as "rect".
392           int xx[4];
393           int i=0;
394           for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
395           {
396             xx[i]=coords[rect_pos];
397           }
398           if(i!=4)
399           {
400             G_THROW( ERR_MSG("XMLAnno.bad_rect") );
401           }
          // Order the two corners so that the rectangle gets a non-negative size.
402           int xmin,xmax;
403           if(xx[0]>xx[2])
404           {
405             xmax=xx[0];
406             xmin=xx[2];
407           }else
408           {
409             xmin=xx[0];
410             xmax=xx[2];
411           }
412           int ymin,ymax;
413           if(xx[1]>xx[3])
414           {
415             ymax=xx[1];
416             ymin=xx[3];
417           }else
418           {
419             ymin=xx[1];
420             ymax=xx[3];
421           }
422           GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
423           a=GMapRect::create(rect);
424         }else if(shape == "circle")
425         {
426           int xx[4];
427           int i=0;
428           GPosition rect_pos=coords.lastpos();
429           if(rect_pos)
430           {
            // Duplicate the last value, then read the first four entries.
431             coords.append(coords[rect_pos]);
432             for(rect_pos=coords;(rect_pos)&&(i<4);++rect_pos)
433             {
434               xx[i++]=coords[rect_pos];
435             }
436           }
437           if(i!=4)
438           {
439             G_THROW( ERR_MSG("XMLAnno.bad_circle") );
440           }
          // For coords given as "x,y,r", xx[3] is the radius after the y
          // flip (h-1 - scaled r); ry undoes that flip.
441           int x=xx[0],y=xx[1],rx=xx[2],ry=(h-xx[3])-1;
442           GRect rect(x-rx,y-ry,2*rx,2*ry);
443           a=GMapOval::create(rect);
444         }else if(shape == "oval")
445         {
          // Same corner handling as "rect", but the area is a GMapOval.
446           int xx[4];
447           int i=0;
448           for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
449           {
450             xx[i]=coords[rect_pos];
451           }
452           if(i!=4)
453           {
454             G_THROW( ERR_MSG("XMLAnno.bad_oval") );
455           }
456           int xmin,xmax;
457           if(xx[0]>xx[2])
458           {
459             xmax=xx[0];
460             xmin=xx[2];
461           }else
462           {
463             xmin=xx[0];
464             xmax=xx[2];
465           }
466           int ymin,ymax;
467           if(xx[1]>xx[3])
468           {
469             ymax=xx[1];
470             ymin=xx[3];
471           }else
472           {
473             ymin=xx[1];
474             ymax=xx[3];
475           }
476           GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
477           a=GMapOval::create(rect);
478         }else if(shape == "poly")
479         {
480           GP<GMapPoly> p=GMapPoly::create();
          // Vertices are read as (x,y) pairs; a trailing unpaired value is ignored.
481           for(GPosition poly_pos=coords;poly_pos;++poly_pos)
482           {
483             int x=coords[poly_pos];
484             if(! ++poly_pos)
485               break;
486             int y=coords[poly_pos];
487             p->add_vertex(x,y);
488           }
489           p->close_poly();
490           a=p;
491         }else
492         {
493           G_THROW( ( ERR_MSG("XMLAnno.unknown_shape") "\t")+shape );
494         }
495         if(a)
496         {
          // Copy the optional AREA attributes onto the new map area.
          // Note: this `pos` is a new local that shadows the loop variable.
497           GPosition pos;
498           if((pos=args.contains("href")))
499           {
500             a->url=args[pos];
501           }
502           if((pos=args.contains("target")))
503           {
504             a->target=args[pos];
505           }
506           if((pos=args.contains("alt")))
507           {
508             a->comment=args[pos];
509           }
510           if((pos=args.contains("bordertype")))
511           {
512             GUTF8String b=args[pos];
513             static const GMap<GUTF8String,GMapArea::BorderType> typeMap=BorderTypeMap();
514             if((pos=typeMap.contains(b)))
515             {
516               a->border_type=typeMap[pos];
517             }else
518             {
519               G_THROW( (ERR_MSG("XMLAnno.unknown_border") "\t")+b );
520             }
521           }
          // Only the presence of "visible" matters; its value is not read.
522           a->border_always_visible=!!args.contains("visible");
523           if((pos=args.contains("bordercolor")))
524           {
525             a->border_color=convertToColor(args[pos]);
526           }
527           if((pos=args.contains("highlight")))
528           {
529             a->hilite_color=convertToColor(args[pos]);
530           }
531           if((pos=args.contains("border")))
532           {
533              a->border_width=args[pos].toInt(); //atoi(args[pos]);
534           }
535           map_areas.append(a);
536         }
537       }
538     }
539   }
  // The annotations are re-encoded even when no AREA was processed.
540   dfile.set_modified(true);
541   dfile.anno=ByteStream::create();
542   anno.encode(dfile.anno);
543 }
544 
545 GP<DjVuFile>
get_file(const GURL & url,GUTF8String id)546 lt_XMLParser::Impl::get_file(const GURL &url,GUTF8String id)
547 {
548   GP<DjVuFile> dfile;
549   GP<DjVuDocument> doc;
550   GCriticalSectionLock lock(&xmlparser_lock);
551   {
552     GPosition pos=m_docs.contains(url.get_string());
553     if(pos)
554     {
555       doc=m_docs[pos];
556     }else
557     {
558       doc=DjVuDocument::create_wait(url);
559       if(! doc->wait_for_complete_init())
560       {
561         G_THROW(( ERR_MSG("XMLAnno.fail_init") "\t")+url.get_string() );
562       }
563       m_docs[url.get_string()]=doc;
564     }
565     if(id.is_int())
566     {
567       const int xpage=id.toInt(); //atoi((char const *)page);
568       if(xpage>0)
569         id=doc->page_to_id(xpage-1);
570     }else if(!id.length())
571     {
572       id=doc->page_to_id(0);
573     }
574   }
575   const GURL fileurl(doc->id_to_url(id));
576   GPosition dpos(m_files.contains(fileurl.get_string()));
577   if(!dpos)
578   {
579     if(!doc->get_id_list().contains(id))
580     {
581       G_THROW( ERR_MSG("XMLAnno.bad_page") );
582     }
583     dfile=doc->get_djvu_file(id,false);
584     if(!dfile)
585     {
586       G_THROW( ERR_MSG("XMLAnno.bad_page") );
587     }
588     m_files[fileurl.get_string()]=dfile;
589   }else
590   {
591     dfile=m_files[dpos];
592   }
593   return dfile;
594 }
595 
// Applies a parsed XML tag tree to the DjVu files it refers to.
//
// The tree must contain exactly one BODY element.  MAP elements (keyed by
// their "name" attribute) are collected from the BODY list and from the
// OBJECT list.  Every OBJECT of the BODY that has a "data" attribute is
// then processed; an OBJECT whose "type" attribute is present and differs
// from image/x.djvu is skipped.
//
// For each processed OBJECT:
//   - the target URL is *pdjvufile when pdjvufile is non-null, otherwise
//     it is built from the "data" attribute and the codebase;
//   - "width" and "height" attributes (0 when absent) are read;
//   - PARAM children named page, dpi, gamma, ocr (or a "flags" PARAM
//     carrying those keys) are read;
//   - ChangeInfo is called when dpi is an integer or gamma is a float,
//     followed by parse_anno, parse_meta, parse_text and ChangeTextOCR,
//     in that order.
//
// Throws XMLAnno.extra_body when there is not exactly one BODY and
// XMLAnno.no_body when that BODY entry is null.
596 void
parse(const lt_XMLTags & tags,GURL * pdjvufile)597 lt_XMLParser::Impl::parse(const lt_XMLTags &tags, GURL *pdjvufile)
598 {
599   const GPList<lt_XMLTags> Body(tags.get_Tags(bodytag));
600   GPosition pos=Body;
601 
  // Reject both "no BODY" and "more than one BODY" (first != last).
602   if(!pos || (pos != Body.lastpos()))
603   {
604     G_THROW( ERR_MSG("XMLAnno.extra_body") );
605   }
606   const GP<lt_XMLTags> GBody(Body[pos]);
607   if(!GBody)
608   {
609     G_THROW( ERR_MSG("XMLAnno.no_body") );
610   }
611 
612   GMap<GUTF8String,GP<lt_XMLTags> > Maps;
613   lt_XMLTags::get_Maps(maptag,"name",Body,Maps);
614 
615   const GPList<lt_XMLTags> Objects(GBody->get_Tags(objecttag));
616   lt_XMLTags::get_Maps(maptag,"name",Objects,Maps);
617 
618   for(GPosition Objpos=Objects;Objpos;++Objpos)
619   {
620     lt_XMLTags &GObject=*Objects[Objpos];
621     // Map of attributes to value (e.g. "width" --> "500")
622     const GMap<GUTF8String,GUTF8String> &args=GObject.get_args();
623     GURL codebase;
624     {
625       DEBUG_MSG("Setting up codebase... m_codebase = " << m_codebase << "\n");
626       GPosition codebasePos=args.contains("codebase");
627       // If user specified a codebase attribute, assume it is correct (absolute URL):
628       //  the GURL constructor will throw an exception if it isn't
629       if(codebasePos)
630       {
631         codebase=GURL::UTF8(args[codebasePos]);
632       }else if (m_codebase.is_dir())
633       {
634         codebase=m_codebase;
635       }else
636       {
        // Last resort: the current working directory.
637         codebase=GURL::Filename::UTF8(GOS::cwd());
638       }
639       DEBUG_MSG("codebase = " << codebase << "\n");
640     }
641     // the data attribute specifies the input file.  This can be
642     //  either an absolute URL (starts with file:/) or a relative
643     //  URL (for now, just a path and file name).  If it's absolute,
644     //  our GURL will adequately wrap it.  If it's relative, we need
645     //  to use the codebase attribute to form an absolute URL first.
646     GPosition datapos=args.contains("data");
647     if(datapos)
648     {
649       GPosition typePos(args.contains("type"));
650       if(typePos)
651         {
          // Skip objects explicitly typed as something other than DjVu.
652           if(args[typePos] != mimetype)
653           continue;
654         }
      // An explicit pdjvufile overrides the "data" attribute.  Otherwise a
      // "data" value starting with '/' is combined with codebase.base()
      // rather than with the codebase itself.
655       const GURL url = (pdjvufile) ? *pdjvufile
656         : GURL::UTF8(args[datapos],
657                      (args[datapos][0] == '/') ? codebase.base() : codebase);
658       int width;
659       {
660         GPosition widthPos=args.contains("width");
661         width=(widthPos)?args[widthPos].toInt():0;
662       }
663       int height;
664       {
665         GPosition heightPos=args.contains("height");
666         height=(heightPos)?args[heightPos].toInt():0;
667       }
      // Settings collected from the PARAM children; later PARAMs overwrite
      // earlier ones.
668       GUTF8String gamma;
669       GUTF8String dpi;
670       GUTF8String page;
671       GUTF8String do_ocr;
672       {
673         GPosition paramPos(GObject.contains(paramtag));
674         if(paramPos)
675         {
676           const GPList<lt_XMLTags> Params(GObject[paramPos]);
677           for(GPosition loc=Params;loc;++loc)
678           {
679             const GMap<GUTF8String,GUTF8String> &pargs=Params[loc]->get_args();
680             GPosition namepos=pargs.contains("name");
681             if(namepos)
682             {
683               GPosition valuepos=pargs.contains("value");
684               if(valuepos)
685               {
                // PARAM names are matched case-insensitively (downcased).
686                 const GUTF8String name=pargs[namepos].downcase();
687                 const GUTF8String &value=pargs[valuepos];
688                 if(name == "flags")
689                 {
                  // "flags" packs several settings into one value; this
                  // local `args` shadows the OBJECT attribute map above.
690                   GMap<GUTF8String,GUTF8String> args;
691                   lt_XMLTags::ParseValues(value,args,true);
692                   if(args.contains("page"))
693                   {
694                     page=args["page"];
695                   }
696                   if(args.contains("dpi"))
697                   {
698                     dpi=args["dpi"];
699                   }
700                   if(args.contains("gamma"))
701                   {
702                     gamma=args["gamma"];
703                   }
704                   if(args.contains("ocr"))
705                   {
706                     do_ocr=args["ocr"];
707                   }
708                 }else if(name == "page")
709                 {
710                   page=value;
711                 }else if(name == "dpi")
712                 {
713                   dpi=value;
714                 }else if(name == "gamma")
715                 {
716                   gamma=value;
717                 }else if(name == "ocr")
718                 {
719                   do_ocr=value;
720                 }
721               }
722             }
723           }
724         }
725       }
726       const GP<DjVuFile> dfile(get_file(url,page));
727       if(dpi.is_int() || gamma.is_float())
728       {
        // Both values are passed on; ChangeInfo range-checks each of them.
729         int pos=0;
730         ChangeInfo(*dfile,dpi.toInt(),gamma.toDouble(pos,pos));
731       }
732       parse_anno(width,height,GObject,Maps,*dfile);
733       parse_meta(GObject,*dfile);
734       parse_text(width,height,GObject,*dfile);
735       ChangeTextOCR(do_ocr,width,height,dfile);
736     }
737   }
738 }
739 
740 void
parse_anno(const int width,const int height,const lt_XMLTags & GObject,GMap<GUTF8String,GP<lt_XMLTags>> & Maps,DjVuFile & dfile)741 lt_XMLParser::Impl::parse_anno(
742   const int width,
743   const int height,
744   const lt_XMLTags &GObject,
745   GMap<GUTF8String,GP<lt_XMLTags> > &Maps,
746   DjVuFile &dfile )
747 {
748   GP<lt_XMLTags> map;
749   {
750     GPosition usemappos=GObject.get_args().contains("usemap");
751     if(usemappos)
752     {
753       const GUTF8String mapname(GObject.get_args()[usemappos]);
754       GPosition mappos=Maps.contains(mapname);
755       if(!mappos)
756       {
757         G_THROW((ERR_MSG("XMLAnno.map_find") "\t")+mapname );
758       }else
759       {
760         map=Maps[mappos];
761       }
762     }
763   }
764   if(map)
765   {
766     ChangeAnno(width,height,dfile,*map);
767   }
768 }
769 
// Local max/min helpers.  Any macro of the same name is removed first so
// that the templates below can be declared.
#ifdef max
#undef max
#endif
// Returns a when a>b, otherwise b.
template<class TYPE>
static inline TYPE max(TYPE a,TYPE b)
{
  if(a>b)
    return a;
  return b;
}
#ifdef min
#undef min
#endif
// Returns a when a<b, otherwise b.
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b)
{
  if(a<b)
    return a;
  return b;
}
780 
781 // used to build the zone tree
782 // true is returned if the GRect is known for this object,
783 // and false, if the rectangle's size is just the parent size.
//
// Parameters:
//   parent  zone the new zone is appended to; its rectangle is grown to
//           include the new zone's rectangle before returning
//   tag     XML element to convert (children are converted recursively)
//   bs      stream that receives the text of the zones; each zone records
//           its start offset and length within it
//   height  page height used to flip the y axis (y = height-1 - scaled y)
//   ws, hs  horizontal / vertical scale factors applied to "coords"
//
// The element name selects the zone type.  Any name other than CHARACTER,
// WORD, LINE, PARAGRAPH, REGION or PAGECOLUMN does not create a child:
// `parent` itself is used and marked as a PAGE zone.
//
// Return value: starts as true; it becomes false for a CHARACTER without
// usable coords, and for a container without usable coords it is the
// result of the last child call made in the do-loop below.
784 static bool
make_child_layer(DjVuTXT::Zone & parent,const lt_XMLTags & tag,ByteStream & bs,const int height,const double ws,const double hs)785 make_child_layer(
786   DjVuTXT::Zone &parent,
787   const lt_XMLTags &tag, ByteStream &bs,
788   const int height, const double ws, const double hs)
789 {
790   bool retval=true;
791   // the plugin thinks there are only Pages, Lines and Words
792   // so we don't make Paragraphs, Regions and Columns zones
793   // if we did the plugin is not able to search the text but
794   // DjVuToText writes out all the text anyway
795   DjVuTXT::Zone *self_ptr;
  // Separator byte written after this zone's text (0 means none).
796   char sepchar;
797   const GUTF8String name(tag.get_name());
798   if(name == charactertag)
799   {
800     self_ptr=parent.append_child();
801     self_ptr->ztype = DjVuTXT::CHARACTER;
802     sepchar=0;
803   }else if(name == wordtag)
804   {
805     self_ptr=parent.append_child();
806     self_ptr->ztype = DjVuTXT::WORD;
807     sepchar=' ';
808   }else if(name == linetag)
809   {
810     self_ptr=parent.append_child();
811     self_ptr->ztype = DjVuTXT::LINE;
812     sepchar=DjVuTXT::end_of_line;
813   }else if(name == paragraphtag)
814   {
815     self_ptr=parent.append_child();
816     self_ptr->ztype = DjVuTXT::PARAGRAPH;
817     sepchar=DjVuTXT::end_of_paragraph;
818   }else if(name == regiontag)
819   {
820     self_ptr=parent.append_child();
821     self_ptr->ztype = DjVuTXT::REGION;
822     sepchar=DjVuTXT::end_of_region;
823   }else if(name == pagecolumntag)
824   {
825     self_ptr=parent.append_child();
826     self_ptr->ztype = DjVuTXT::COLUMN;
827     sepchar=DjVuTXT::end_of_column;
828   }else
829   {
    // Unrecognized element: operate directly on the parent zone.
830     self_ptr = &parent;
831     self_ptr->ztype = DjVuTXT::PAGE;
832     sepchar=0;
833   }
834   DjVuTXT::Zone &self = *self_ptr;
835   self.text_start = bs.tell();
  // xmin..ymax are references into self.rect, so assigning self.rect
  // below also changes them (and vice versa).
836   int &xmin=self.rect.xmin, &ymin=self.rect.ymin,
837     &xmax=self.rect.xmax, &ymax=self.rect.ymax;
  // Note that the default rectangle is built "inside out" (its min is the
  // larger parent bound and its max the smaller one).
  // NOTE(review): presumably so that the min/max merging further down
  // yields the parent's extent -- confirm.
838   GRect default_rect;
839   default_rect.xmin=max(parent.rect.xmax,parent.rect.xmin);
840   default_rect.xmax=min(parent.rect.xmax,parent.rect.xmin);
841   default_rect.ymin=max(parent.rect.ymax,parent.rect.ymin);
842   default_rect.ymax=min(parent.rect.ymax,parent.rect.ymin);
843   // Now if there are coordinates, use those.
844   GPosition pos(tag.get_args().contains("coords"));
845   if(pos)
846   {
847     GList<int> rectArgs;
848     intList(tag.get_args()[pos], rectArgs);
    // Four values are expected (x, y, x, y).  `pos` ends up non-null only
    // when all four were present; it doubles as the "has coords" flag below.
849     if((pos=rectArgs))
850     {
851       xmin=(int)(ws*(double)rectArgs[pos]);
852       if(++pos)
853       {
854         ymin=(height-1)-(int)(hs*(double)rectArgs[pos]);
855         if(++pos)
856         {
857           xmax=(int)(ws*(double)rectArgs[pos]);
858           if(++pos)
859           {
860             ymax=(height-1)-(int)(hs*(double)rectArgs[pos]);
861             if(xmin>xmax) // Make sure xmin is really minimum
862             {
863               const int t=xmin;
864               xmin=xmax;
865               xmax=t;
866             }
867             if(ymin>ymax) // Make sure ymin is really minimum
868             {
869               const int t=ymin;
870               ymin=ymax;
871               ymax=t;
872             }
873           }
874         }
875       }
876     }
877   }
878   if(self.ztype == DjVuTXT::CHARACTER)
879   {
    // Characters never recurse: their raw text (trimmed of surrounding
    // whitespace) is written directly.
880     if(! pos)
881     {
882       self.rect=default_rect;
883       retval=false;
884     }
885     const GUTF8String raw(tag.get_raw().fromEscaped());
886     const int i=raw.nextNonSpace(0);
887     bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
888     if(sepchar)
889       bs.write8(sepchar);
890     self.text_length = bs.tell() - self.text_start;
891   }else if(pos)
892   {
    // Container with explicit coords: keep the rectangle, convert the
    // children (their return values are ignored), or write the raw text
    // when there are no children.
893     pos=tag.get_content();
894     if(pos)
895     {
896       for(pos=tag.get_content(); pos; ++pos)
897       {
898         const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
899         make_child_layer(self, *t, bs, height,ws,hs);
900       }
901       if(sepchar)
902         bs.write8(sepchar);
903       self.text_length = bs.tell() - self.text_start;
904     }else
905     {
906       const GUTF8String raw(tag.get_raw().fromEscaped());
907       const int i=raw.nextNonSpace(0);
908       bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
909       if(sepchar)
910         bs.write8(sepchar);
911       self.text_length = bs.tell() - self.text_start;
912     }
913   }else
914   {
    // Container without usable coords: derive the rectangle from the children.
915     self.rect=default_rect;
916     if((pos=tag.get_content()))
917     {
918       do
919       {
920         const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
        // Remember the rectangle accumulated so far, reset self.rect,
        // and merge the two after the child call returns.
921         const GRect save_rect(self.rect);
922         self.rect=default_rect;
923 	if ((retval = make_child_layer(self, *t, bs, height, ws, hs)))
924         {
925           xmin=min(save_rect.xmin,xmin);
926           xmax=max(save_rect.xmax,xmax);
927           ymin=min(save_rect.ymin,ymin);
928           ymax=max(save_rect.ymax,ymax);
929         }else
930         {
931           // If the child doesn't have coordinates, we need to use a box
932           // at least as big as the parent's coordinates.
933           xmin=min(save_rect.xmin,default_rect.xmax);
934           xmax=max(save_rect.xmax,default_rect.xmin);
935           ymin=min(save_rect.ymin,default_rect.ymax);
936           ymax=max(save_rect.ymax,default_rect.ymin);
          // NOTE(review): `pos` has not been advanced here, so this loop
          // starts with the child that was just converted -- confirm that
          // converting it a second time is intended.
937           for(; pos; ++pos)
938           {
939             const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
940             make_child_layer(self, *t, bs, height,ws,hs);
941           }
942           break;
943         }
944       } while(++pos);
945       if(sepchar)
946         bs.write8(sepchar);
947       self.text_length = bs.tell() - self.text_start;
948     }else
949     {
950       const GUTF8String raw(tag.get_raw().fromEscaped());
951       const int i=raw.nextNonSpace(0);
952       bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
953       if(sepchar)
954         bs.write8(sepchar);
955       self.text_length = bs.tell() - self.text_start;
956     }
957   }
  // Grow the parent's rectangle to include this zone, then make sure this
  // zone's own rectangle is ordered (min <= max).
958   parent.rect.xmin=min(xmin,parent.rect.xmin);
959   parent.rect.ymin=min(ymin,parent.rect.ymin);
960   parent.rect.xmax=max(xmax,parent.rect.xmax);
961   parent.rect.ymax=max(ymax,parent.rect.ymax);
962   if(xmin>xmax)
963   {
964     const int t=xmin;
965     xmin=xmax;
966     xmax=t;
967   }
968   if(ymin>ymax)
969   {
970     const int t=ymin;
971     ymin=ymax;
972     ymax=t;
973   }
974 //  DjVuPrintMessage("(%d,%d)(%d,%d)<<<\\%o>>>\n",
975 //    xmin,ymin,xmax,ymax, sepchar);
976   return retval;
977 }
978 
979 void
ChangeTextOCR(const GUTF8String & value,const int width,const int height,const GP<DjVuFile> & dfile)980 lt_XMLParser::Impl::ChangeTextOCR(
981   const GUTF8String &value,
982   const int width,
983   const int height,
984   const GP<DjVuFile> &dfile)
985 {
986   if(value.length() && value.downcase() != "false")
987   {
988     const GP<ByteStream> bs=OCRcallback(value,DjVuImage::create(dfile));
989     if( bs && bs->size() )
990     {
991       const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
992       ChangeText(width,height,*dfile,*tags);
993     }
994   }
995 }
996 
997 void
ChangeMeta(DjVuFile & dfile,const lt_XMLTags & tags)998 lt_XMLParser::Impl::ChangeMeta(
999   DjVuFile &dfile, const lt_XMLTags &tags )
1000 {
1001   dfile.resume_decode(true);
1002   GP<ByteStream> gbs(ByteStream::create());
1003   tags.write(*gbs,false);
1004   gbs->seek(0L);
1005   GUTF8String raw(gbs->getAsUTF8());
1006   if(raw.length())
1007   {
1008      //GUTF8String gs="<"+(metadatatag+(">"+raw))+"</"+metadatatag+">\n");
1009     dfile.change_meta(raw+"\n");
1010   }else
1011   {
1012     dfile.change_meta(GUTF8String());
1013   }
1014 }
1015 
1016 void
ChangeText(const int width,const int height,DjVuFile & dfile,const lt_XMLTags & tags)1017 lt_XMLParser::Impl::ChangeText(
1018   const int width, const int height,
1019   DjVuFile &dfile, const lt_XMLTags &tags )
1020 {
1021   dfile.resume_decode(true);
1022 
1023   GP<DjVuText> text = DjVuText::create();
1024   GP<DjVuTXT> txt = text->txt = DjVuTXT::create();
1025 
1026   // to store the new text
1027   GP<ByteStream> textbs = ByteStream::create();
1028 
1029   GP<DjVuInfo> info=(dfile.info);
1030   if(info)
1031   {
1032     const int h=info->height;
1033     const int w=info->width;
1034     txt->page_zone.text_start = 0;
1035     DjVuTXT::Zone &parent=txt->page_zone;
1036     parent.rect.xmin=0;
1037     parent.rect.ymin=0;
1038     parent.rect.ymax=h;
1039     parent.rect.xmax=w;
1040     double ws=1.0;
1041     if(width && width != w)
1042     {
1043       ws=((double)w)/((double)width);
1044     }
1045     double hs=1.0;
1046     if(height && height != h)
1047     {
1048       hs=((double)h)/((double)height);
1049     }
1050     make_child_layer(parent, tags, *textbs, h, ws,hs);
1051     textbs->write8(0);
1052     long len = textbs->tell();
1053     txt->page_zone.text_length = len;
1054     textbs->seek(0,SEEK_SET);
1055     textbs->read(txt->textUTF8.getbuf(len), len);
1056 
1057     dfile.change_text(txt,false);
1058   }
1059 }
1060 
1061 void
parse_text(const int width,const int height,const lt_XMLTags & GObject,DjVuFile & dfile)1062 lt_XMLParser::Impl::parse_text(
1063   const int width,
1064   const int height,
1065   const lt_XMLTags &GObject,
1066   DjVuFile &dfile )
1067 {
1068   GPosition textPos = GObject.contains(hiddentexttag);
1069   if(textPos)
1070   {
1071     // loop through the hidden text - there should only be one
1072     // if there are more ??only the last one will be saved??
1073     GPList<lt_XMLTags> textTags = GObject[textPos];
1074     GPosition pos = textTags;
1075     ChangeText(width,height,dfile,*textTags[pos]);
1076   }
1077 }
1078 
1079 void
parse_meta(const lt_XMLTags & GObject,DjVuFile & dfile)1080 lt_XMLParser::Impl::parse_meta(
1081   const lt_XMLTags &GObject,
1082   DjVuFile &dfile )
1083 {
1084   GPosition metaPos = GObject.contains(metadatatag);
1085   if(metaPos)
1086   {
1087     // loop through the hidden text - there should only be one
1088     // if there are more ??only the last one will be saved??
1089     GPList<lt_XMLTags> metaTags = GObject[metaPos];
1090     GPosition pos = metaTags;
1091     ChangeMeta(dfile,*metaTags[pos]);
1092   }
1093 }
1094 
1095 static GP<ByteStream>
OCRcallback(void * const xarg,lt_XMLParser::mapOCRcallback * const xcallback,const GUTF8String & value,const GP<DjVuImage> & image)1096 OCRcallback(
1097   void * const xarg,
1098   lt_XMLParser::mapOCRcallback * const xcallback,
1099   const GUTF8String &value,
1100   const GP<DjVuImage> &image )
1101 {
1102   GP<ByteStream> retval;
1103   static void *arg=0;
1104   static lt_XMLParser::mapOCRcallback *callback=0;
1105   if(image)
1106   {
1107     if(callback)
1108       retval=callback(arg,value,image);
1109   }else
1110   {
1111     arg=xarg;
1112     callback=xcallback;
1113   }
1114   return retval;
1115 }
1116 
1117 void
setOCRcallback(void * const arg,mapOCRcallback * const callback)1118 lt_XMLParser::setOCRcallback(
1119   void * const arg,
1120   mapOCRcallback * const callback)
1121 {
1122   ::OCRcallback(arg,callback);
1123 }
1124 
1125 
1126 #ifdef HAVE_NAMESPACES
1127 }
1128 # ifndef NOT_USING_DJVU_NAMESPACE
1129 using namespace DJVU;
1130 # endif
1131 #endif
1132