1 //C- -*- C++ -*-
2 //C- -------------------------------------------------------------------
3 //C- DjVuLibre-3.5
4 //C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
5 //C- Copyright (c) 2001 AT&T
6 //C-
7 //C- This software is subject to, and may be distributed under, the
8 //C- GNU General Public License, either Version 2 of the license,
9 //C- or (at your option) any later version. The license should have
10 //C- accompanied the software or you may obtain a copy of the license
11 //C- from the Free Software Foundation at http://www.fsf.org .
12 //C-
13 //C- This program is distributed in the hope that it will be useful,
14 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 //C- GNU General Public License for more details.
17 //C-
18 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19 //C- Lizardtech Software. Lizardtech Software has authorized us to
20 //C- replace the original DjVu(r) Reference Library notice by the following
21 //C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22 //C-
23 //C- ------------------------------------------------------------------
24 //C- | DjVu (r) Reference Library (v. 3.5)
25 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
27 //C- | 6,058,214 and patents pending.
28 //C- |
29 //C- | This software is subject to, and may be distributed under, the
30 //C- | GNU General Public License, either Version 2 of the license,
31 //C- | or (at your option) any later version. The license should have
32 //C- | accompanied the software or you may obtain a copy of the license
33 //C- | from the Free Software Foundation at http://www.fsf.org .
34 //C- |
35 //C- | The computer code originally released by LizardTech under this
36 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
37 //C- | ORIGINAL CODE." Subject to any third party intellectual property
38 //C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39 //C- | non-exclusive license to make, use, sell, or otherwise dispose of
40 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42 //C- | General Public License. This grant only confers the right to
43 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44 //C- | the extent such infringement is reasonably necessary to enable
45 //C- | recipient to make, have made, practice, sell, or otherwise dispose
46 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47 //C- | any greater extent that may be necessary to utilize further
48 //C- | modifications or combinations.
49 //C- |
50 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54 //C- +------------------------------------------------------------------
55
56 #ifdef HAVE_CONFIG_H
57 # include "config.h"
58 #endif
59 #if NEED_GNUG_PRAGMAS
60 # pragma implementation
61 #endif
62
63 // From: Leon Bottou, 1/31/2002
64 // This is purely Lizardtech stuff.
65
66 #include "XMLParser.h"
67 #include "XMLTags.h"
68 #include "ByteStream.h"
69 #include "GOS.h"
70 #include "DjVuDocument.h"
71 #include "DjVuText.h"
72 #include "DjVuAnno.h"
73 #include "DjVuFile.h"
74 #include "DjVuImage.h"
75 #include "debug.h"
76 #include <stdio.h>
77 #include <ctype.h>
78 #include <stddef.h>
79 #include <stdlib.h>
80
81
82 #ifdef HAVE_NAMESPACES
83 namespace DJVU {
84 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
85 }
86 #endif
87 #endif
88
// Value the "type" attribute of an OBJECT element is compared against in
// parse(); objects declaring a different type are skipped.
static const char mimetype[]="image/x.djvu";
// Element names looked up by parse(): the single BODY element, the MAP
// elements (via lt_XMLTags::get_Maps), the OBJECT elements and their
// PARAM children.  AREA children of a MAP are read by ChangeAnno().
static const char bodytag[]="BODY";
static const char areatag[]="AREA";
static const char maptag[]="MAP";
static const char objecttag[]="OBJECT";
static const char paramtag[]="PARAM";
// Element names that make_child_layer() maps to DjVuTXT zone types
// (CHARACTER, WORD, LINE, PARAGRAPH, REGION, COLUMN respectively).
static const char charactertag[]="CHARACTER";
static const char wordtag[]="WORD";
static const char linetag[]="LINE";
static const char paragraphtag[]="PARAGRAPH";
static const char regiontag[]="REGION";
static const char pagecolumntag[]="PAGECOLUMN";
// Child element of OBJECT looked up by parse_text().
static const char hiddentexttag[]="HIDDENTEXT";
// Child element of OBJECT looked up by parse_meta().
static const char metadatatag[]="METADATA";
103
// Concrete parser returned by lt_XMLParser::create().  It keeps the
// documents and files it has touched in m_docs / m_files until save()
// or empty() is called.
class lt_XMLParser::Impl : public lt_XMLParser
{
public:
  Impl(void);
  virtual ~Impl();
  /// Parse the specified bytestream.
  virtual void parse(const GP<ByteStream> &bs, GURL *pdjvufile);
  /// Parse the specified tags - this one does all the work
  virtual void parse(const lt_XMLTags &tags, GURL *pdjvufile);
  /// Write every document held in m_docs back to its init url, then empty().
  virtual void save(void);
  /// Forget all cached files and documents.
  virtual void empty(void);
protected:
  // Returns the DjVuFile for a page of the document at url, loading and
  // caching the document/file on first use.  The second argument (named
  // `id' in the definition) may be a 1-based page number, a file id, or
  // empty for the first page.
  GP<DjVuFile> get_file(const GURL &url,GUTF8String page);

  // Looks up the MAP named by the OBJECT's "usemap" attribute in Maps
  // and, when found, applies it with ChangeAnno().
  void parse_anno(const int width, const int height,
    const lt_XMLTags &GObject,
    GMap<GUTF8String,GP<lt_XMLTags> > &Maps, DjVuFile &dfile);

  // Applies the OBJECT's HIDDENTEXT child, if any, with ChangeText().
  void parse_text(const int width, const int height,
    const lt_XMLTags &GObject, DjVuFile &dfile);

  // Applies the OBJECT's METADATA child, if any, with ChangeMeta().
  void parse_meta(const lt_XMLTags &GObject, DjVuFile &dfile);

  // Replaces the map areas of dfile's annotations with the AREA
  // elements of map; width/height are the XML-side page dimensions
  // used for scaling (0 means "no scaling").
  void ChangeAnno( const int width, const int height,
    DjVuFile &dfile, const lt_XMLTags &map);

  // Updates dpi (accepted range 5..4800) and/or gamma (0.1..5.0) in
  // dfile's info when they differ from the current values.
  void ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma);

  // Rebuilds dfile's hidden text from the given tag tree.
  void ChangeText( const int width, const int height,
    DjVuFile &dfile, const lt_XMLTags &map);

  // Replaces dfile's metadata with the serialized form of the tag.
  void ChangeMeta( DjVuFile &dfile, const lt_XMLTags &map);

  // Runs the registered OCR callback (unless value is empty or "false")
  // and feeds its XML output to ChangeText().
  void ChangeTextOCR( const GUTF8String &value,
    const int width, const int height,
    const GP<DjVuFile> &dfile);

  // We may want to make these lists of modified files static so
  // that they only need to be loaded and saved once.

  // Cached files, keyed by the file url string (see get_file()).
  GMap<GUTF8String,GP<DjVuFile> > m_files;
  // Cached documents, keyed by the document url string (see get_file()).
  GMap<GUTF8String,GP<DjVuDocument> > m_docs;

  // Fallback codebase used by parse() when an OBJECT has no "codebase"
  // attribute and this url is a directory.  Nothing in this file
  // assigns it.
  GURL m_codebase;
  // Taken by empty(), save() and get_file() while they touch the caches.
  GCriticalSection xmlparser_lock;
};
152
// Forward declaration of the combined "register / invoke" OCR hook that
// is defined near the end of this file.  With a null image the call
// stores xarg/xcallback as the current callback; with a non-null image
// it invokes the stored callback (if any) on value and image.
// The default arguments live here, so setOCRcallback() can call it with
// just the first two parameters.
static GP<ByteStream>
OCRcallback(
  void * const xarg,
  lt_XMLParser::mapOCRcallback * const xcallback,
  const GUTF8String &value=GUTF8String(),
  const GP<DjVuImage> &image=0 );
159
160 static inline GP<ByteStream>
OCRcallback(const GUTF8String & value,const GP<DjVuImage> & image)161 OCRcallback(const GUTF8String &value, const GP<DjVuImage> &image)
162 {
163 return OCRcallback(0,0,value,image);
164 }
165
lt_XMLParser()166 lt_XMLParser::lt_XMLParser() {}
~lt_XMLParser()167 lt_XMLParser::~lt_XMLParser() {}
Impl()168 lt_XMLParser::Impl::Impl() {}
~Impl()169 lt_XMLParser::Impl::~Impl() {}
170
171 GP<lt_XMLParser>
create(void)172 lt_XMLParser::create(void)
173 {
174 return new lt_XMLParser::Impl;
175 }
176
177 // helper function for args
178 static void
intList(GUTF8String coords,GList<int> & retval)179 intList(GUTF8String coords, GList<int> &retval)
180 {
181 int pos=0;
182 while(coords.length())
183 {
184 int epos;
185 unsigned long i=coords.toLong(pos,epos,10);
186 if(epos>=0)
187 {
188 retval.append(i);
189 const int n=coords.nextNonSpace(epos);
190 if(coords[n] != ',')
191 break;
192 pos=n+1;
193 }
194 }
195 }
196
197 void
empty(void)198 lt_XMLParser::Impl::empty(void)
199 {
200 GCriticalSectionLock lock(&xmlparser_lock);
201 m_files.empty();
202 m_docs.empty();
203 }
204
205 void
save(void)206 lt_XMLParser::Impl::save(void)
207 {
208 GCriticalSectionLock lock(&xmlparser_lock);
209 for(GPosition pos=m_docs;pos;++pos)
210 {
211 const GP<DjVuDocument> doc(m_docs[pos]);
212 const GURL url=doc->get_init_url();
213
214 DEBUG_MSG("Saving "<<(const char *)url<<" with new text and annotations\n");
215 const bool bundle=doc->is_bundled()||(doc->get_doc_type()==DjVuDocument::SINGLE_PAGE);
216 doc->save_as(url,bundle);
217 }
218 empty();
219 }
220
221 void
parse(const GP<ByteStream> & bs,GURL * pdjvufile)222 lt_XMLParser::Impl::parse(const GP<ByteStream> &bs, GURL *pdjvufile)
223 {
224 const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
225 parse(*tags, pdjvufile);
226 }
227
228 static const GMap<GUTF8String,GMapArea::BorderType> &
BorderTypeMap(void)229 BorderTypeMap(void)
230 {
231 static GMap<GUTF8String,GMapArea::BorderType> typeMap;
232 if (! typeMap.size())
233 {
234 typeMap["none"]=GMapArea::NO_BORDER;
235 typeMap["xor"]=GMapArea::XOR_BORDER;
236 typeMap["solid"]=GMapArea::SOLID_BORDER;
237 typeMap["default"]=GMapArea::SOLID_BORDER;
238 typeMap["shadowout"]=GMapArea::SHADOW_OUT_BORDER;
239 typeMap["shadowin"]=GMapArea::SHADOW_IN_BORDER;
240 typeMap["etchedin"]=GMapArea::SHADOW_EIN_BORDER;
241 typeMap["etchedout"]=GMapArea::SHADOW_EOUT_BORDER;
242 }
243 return typeMap;
244 }
245
246 static unsigned long
convertToColor(const GUTF8String & s)247 convertToColor(const GUTF8String &s)
248 {
249 unsigned long retval=0;
250 if(s.length())
251 {
252 int endpos = -1;
253 if(s[0] == '#')
254 {
255 retval=s.substr(1,-1).toULong(0,endpos,16);
256 }
257 if(endpos < 0)
258 {
259 G_THROW( (ERR_MSG("XMLAnno.bad_color") "\t")+s );
260 }
261 }
262 return retval;
263 }
264
265 void
ChangeInfo(DjVuFile & dfile,const int dpi,const double gamma)266 lt_XMLParser::Impl::ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma)
267 {
268 GP<DjVuInfo> info;
269 if(dpi >= 5 && dpi <= 4800)
270 {
271 dfile.resume_decode(true);
272 if(dfile.info && (dpi != dfile.info->dpi) )
273 {
274 info=new DjVuInfo(*dfile.info);
275 info->dpi=dpi;
276 }
277 }
278 if(gamma >= 0.1 && gamma <= 5.0)
279 {
280 dfile.resume_decode(true);
281 if(dfile.info && (gamma != dfile.info->gamma) )
282 {
283 if(!info)
284 info=new DjVuInfo(*dfile.info);
285 info->gamma=gamma;
286 }
287 }
288 if(info)
289 {
290 dfile.change_info(info);
291 }
292 }
293
// Replaces the hyperlink/map areas of dfile's annotations with the AREA
// elements found in map.
//   width, height - page dimensions assumed by the XML coordinates; when
//                   non-zero and different from the real page size the
//                   coordinates are scaled accordingly.
//   dfile         - file whose annotation chunk is rewritten.
//   map           - MAP element whose AREA children describe the areas.
// Existing annotations are decoded first, so everything except the map
// areas is carried over.  The annotation stream is re-encoded and the
// file is marked modified even when no AREA element (or no page info)
// is available.
// Throws XMLAnno.bad_rect / bad_circle / bad_oval when a shape has fewer
// than four coordinates, XMLAnno.unknown_shape and XMLAnno.unknown_border
// for unrecognized attribute values, and whatever convertToColor() throws.
void
lt_XMLParser::Impl::ChangeAnno(
  const int width, const int height,
  DjVuFile &dfile,
  const lt_XMLTags &map )
{
  dfile.resume_decode(true);
  const GP<DjVuInfo> info(dfile.info);
  const GP<DjVuAnno> ganno(DjVuAnno::create());
  DjVuAnno &anno=*ganno;
  GPosition map_pos;
  map_pos=map.contains(areatag);
  if(dfile.contains_anno())
  {
    // Start from the annotations already in the file.
    GP<ByteStream> annobs=dfile.get_merged_anno();
    if(annobs)
    {
      anno.decode(annobs);
      if(anno.ant && info)
      {
        // The old map areas are always discarded, even if the MAP
        // turns out to contain no AREA element.
        anno.ant->map_areas.empty();
      }
    }
//    dfile.remove_anno();
  }
  if(info && map_pos)
  {
    const int h=info->height;
    const int w=info->width;
    // Scale factors from XML coordinates to page coordinates.
    double ws=1.0;
    double hs=1.0;
    if(width && width != w)
    {
      ws=((double)w)/((double)width);
    }
    if(height && height != h)
    {
      hs=((double)h)/((double)height);
    }
    if(!anno.ant)
    {
      anno.ant=DjVuANT::create();
    }
    GPList<GMapArea> &map_areas=anno.ant->map_areas;
    map_areas.empty();
    GPList<lt_XMLTags> gareas=map[map_pos];
    for(GPosition pos=gareas;pos;++pos)
    {
      if(gareas[pos])
      {
        lt_XMLTags &areas=*(gareas[pos]);
        GMap<GUTF8String,GUTF8String> args(areas.get_args());
        GList<int> coords;
        // ******************************************************
        // Parse the coords attribute:  first read the raw data into
        // a list, then scale the x, y data into another list.  For
        // circles, you also get a radius element (it looks like an x
        // with no matching y).
        // ******************************************************
        {
          GPosition coords_pos=args.contains("coords");
          if(coords_pos)
          {
            GList<int> raw_coords;
            intList(args[coords_pos],raw_coords);
            // Values are consumed in (x,y) pairs; the loop body advances
            // raw_pos once itself to reach the y value.
            for(GPosition raw_pos=raw_coords;raw_pos;++raw_pos)
            {
              const int r=raw_coords[raw_pos];
              const int x=(int)(ws*(double)r+0.5);
              coords.append(x);
              // y is flipped: the result is measured from row h-1.
              int y=h-1;
              if(! ++raw_pos)
              {
                // Unpaired trailing value (e.g. a circle radius): it is
                // stored both as an x (above) and as a flipped y.
                y-=(int)(hs*(double)r+0.5);
              }else
              {
                y-=(int)(hs*(double)raw_coords[raw_pos]+0.5);
              }
              coords.append(y);
//            DjVuPrintMessage("Coords (%d,%d)\n",x,y);
            }
          }
        }
        GUTF8String shape;
        {
          GPosition shape_pos=args.contains("shape");
          if(shape_pos)
          {
            shape=args[shape_pos];
          }
        }
        GP<GMapArea> a;
        if(shape == "default")
        {
          // "default" covers the whole page.
          GRect rect(0,0,w,h);
          a=GMapRect::create(rect);
        }else if(!shape.length() || shape == "rect")
        {
          // A missing shape attribute is treated as a rectangle.
          int xx[4];
          int i=0;
          for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
          {
            xx[i]=coords[rect_pos];
          }
          if(i!=4)
          {
            G_THROW( ERR_MSG("XMLAnno.bad_rect") );
          }
          // Order the corners so that min <= max on both axes.
          int xmin,xmax;
          if(xx[0]>xx[2])
          {
            xmax=xx[0];
            xmin=xx[2];
          }else
          {
            xmin=xx[0];
            xmax=xx[2];
          }
          int ymin,ymax;
          if(xx[1]>xx[3])
          {
            ymax=xx[1];
            ymin=xx[3];
          }else
          {
            ymin=xx[1];
            ymax=xx[3];
          }
          GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
          a=GMapRect::create(rect);
        }else if(shape == "circle")
        {
          int xx[4];
          int i=0;
          GPosition rect_pos=coords.lastpos();
          if(rect_pos)
          {
            // Duplicate the last element, then read the first four
            // values: center x, center y, radius scaled as an x, and
            // radius scaled and flipped as a y.
            coords.append(coords[rect_pos]);
            for(rect_pos=coords;(rect_pos)&&(i<4);++rect_pos)
            {
              xx[i++]=coords[rect_pos];
            }
          }
          if(i!=4)
          {
            G_THROW( ERR_MSG("XMLAnno.bad_circle") );
          }
          // ry undoes the (h-1)-value flip applied while scaling.
          int x=xx[0],y=xx[1],rx=xx[2],ry=(h-xx[3])-1;
          GRect rect(x-rx,y-ry,2*rx,2*ry);
          a=GMapOval::create(rect);
        }else if(shape == "oval")
        {
          // Same corner handling as "rect", but creates an oval area.
          int xx[4];
          int i=0;
          for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
          {
            xx[i]=coords[rect_pos];
          }
          if(i!=4)
          {
            G_THROW( ERR_MSG("XMLAnno.bad_oval") );
          }
          int xmin,xmax;
          if(xx[0]>xx[2])
          {
            xmax=xx[0];
            xmin=xx[2];
          }else
          {
            xmin=xx[0];
            xmax=xx[2];
          }
          int ymin,ymax;
          if(xx[1]>xx[3])
          {
            ymax=xx[1];
            ymin=xx[3];
          }else
          {
            ymin=xx[1];
            ymax=xx[3];
          }
          GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
          a=GMapOval::create(rect);
        }else if(shape == "poly")
        {
          GP<GMapPoly> p=GMapPoly::create();
          // Vertices are taken pairwise; a trailing x without a y is
          // dropped.
          for(GPosition poly_pos=coords;poly_pos;++poly_pos)
          {
            int x=coords[poly_pos];
            if(! ++poly_pos)
              break;
            int y=coords[poly_pos];
            p->add_vertex(x,y);
          }
          p->close_poly();
          a=p;
        }else
        {
          G_THROW( ( ERR_MSG("XMLAnno.unknown_shape") "\t")+shape );
        }
        if(a)
        {
          // NOTE: this `pos' shadows the loop iterator of the enclosing
          // for statement; it is only used for attribute lookups here.
          GPosition pos;
          if((pos=args.contains("href")))
          {
            a->url=args[pos];
          }
          if((pos=args.contains("target")))
          {
            a->target=args[pos];
          }
          if((pos=args.contains("alt")))
          {
            a->comment=args[pos];
          }
          if((pos=args.contains("bordertype")))
          {
            GUTF8String b=args[pos];
            // Copy of the border type table, made on first use.
            static const GMap<GUTF8String,GMapArea::BorderType> typeMap=BorderTypeMap();
            if((pos=typeMap.contains(b)))
            {
              a->border_type=typeMap[pos];
            }else
            {
              G_THROW( (ERR_MSG("XMLAnno.unknown_border") "\t")+b );
            }
          }
          // Only the presence of the attribute matters, not its value.
          a->border_always_visible=!!args.contains("visible");
          if((pos=args.contains("bordercolor")))
          {
            a->border_color=convertToColor(args[pos]);
          }
          if((pos=args.contains("highlight")))
          {
            a->hilite_color=convertToColor(args[pos]);
          }
          if((pos=args.contains("border")))
          {
             a->border_width=args[pos].toInt(); //atoi(args[pos]);
          }
          map_areas.append(a);
        }
      }
    }
  }
  // Always store the (possibly unchanged) annotations back into the file.
  dfile.set_modified(true);
  dfile.anno=ByteStream::create();
  anno.encode(dfile.anno);
}
544
545 GP<DjVuFile>
get_file(const GURL & url,GUTF8String id)546 lt_XMLParser::Impl::get_file(const GURL &url,GUTF8String id)
547 {
548 GP<DjVuFile> dfile;
549 GP<DjVuDocument> doc;
550 GCriticalSectionLock lock(&xmlparser_lock);
551 {
552 GPosition pos=m_docs.contains(url.get_string());
553 if(pos)
554 {
555 doc=m_docs[pos];
556 }else
557 {
558 doc=DjVuDocument::create_wait(url);
559 if(! doc->wait_for_complete_init())
560 {
561 G_THROW(( ERR_MSG("XMLAnno.fail_init") "\t")+url.get_string() );
562 }
563 m_docs[url.get_string()]=doc;
564 }
565 if(id.is_int())
566 {
567 const int xpage=id.toInt(); //atoi((char const *)page);
568 if(xpage>0)
569 id=doc->page_to_id(xpage-1);
570 }else if(!id.length())
571 {
572 id=doc->page_to_id(0);
573 }
574 }
575 const GURL fileurl(doc->id_to_url(id));
576 GPosition dpos(m_files.contains(fileurl.get_string()));
577 if(!dpos)
578 {
579 if(!doc->get_id_list().contains(id))
580 {
581 G_THROW( ERR_MSG("XMLAnno.bad_page") );
582 }
583 dfile=doc->get_djvu_file(id,false);
584 if(!dfile)
585 {
586 G_THROW( ERR_MSG("XMLAnno.bad_page") );
587 }
588 m_files[fileurl.get_string()]=dfile;
589 }else
590 {
591 dfile=m_files[dpos];
592 }
593 return dfile;
594 }
595
// Applies the content of a parsed XML tree to the DjVu files it refers to.
//   tags      - root of the tag tree; it must contain exactly one BODY.
//   pdjvufile - when non-null, this url is used for every OBJECT instead
//               of the url derived from its "data" attribute.
// For each OBJECT element of the BODY that has a "data" attribute (and,
// if a "type" attribute is present, the DjVu mime type) the function
// updates dpi/gamma, map areas, metadata, hidden text and finally runs
// the OCR callback, in that order.  Modified files stay cached in this
// object; nothing is written to disk here.
// Throws XMLAnno.extra_body when there is not exactly one BODY and
// XMLAnno.no_body when that BODY entry is null.
void
lt_XMLParser::Impl::parse(const lt_XMLTags &tags, GURL *pdjvufile)
{
  const GPList<lt_XMLTags> Body(tags.get_Tags(bodytag));
  GPosition pos=Body;

  // Reject both "no BODY" and "more than one BODY" with the same message.
  if(!pos || (pos != Body.lastpos()))
  {
    G_THROW( ERR_MSG("XMLAnno.extra_body") );
  }
  const GP<lt_XMLTags> GBody(Body[pos]);
  if(!GBody)
  {
    G_THROW( ERR_MSG("XMLAnno.no_body") );
  }

  // Collect MAP elements by their "name" attribute, first from the BODY
  // list and then from the OBJECT list.
  // NOTE(review): how deep get_Maps searches is not visible here.
  GMap<GUTF8String,GP<lt_XMLTags> > Maps;
  lt_XMLTags::get_Maps(maptag,"name",Body,Maps);

  const GPList<lt_XMLTags> Objects(GBody->get_Tags(objecttag));
  lt_XMLTags::get_Maps(maptag,"name",Objects,Maps);

  for(GPosition Objpos=Objects;Objpos;++Objpos)
  {
    // NOTE(review): the list entry is dereferenced without a null check.
    lt_XMLTags &GObject=*Objects[Objpos];
    // Map of attributes to value (e.g. "width" --> "500")
    const GMap<GUTF8String,GUTF8String> &args=GObject.get_args();
    GURL codebase;
    {
      DEBUG_MSG("Setting up codebase... m_codebase = " << m_codebase << "\n");
      GPosition codebasePos=args.contains("codebase");
      // If user specified a codebase attribute, assume it is correct (absolute URL):
      //  the GURL constructor will throw an exception if it isn't
      if(codebasePos)
      {
        codebase=GURL::UTF8(args[codebasePos]);
      }else if (m_codebase.is_dir())
      {
        codebase=m_codebase;
      }else
      {
        // Last resort: the current working directory.
        codebase=GURL::Filename::UTF8(GOS::cwd());
      }
      DEBUG_MSG("codebase = " << codebase << "\n");
    }
    // the data attribute specifies the input file.  This can be
    //  either an absolute URL (starts with file:/) or a relative
    //  URL (for now, just a path and file name).  If it's absolute,
    //  our GURL will adequately wrap it.  If it's relative, we need
    //  to use the codebase attribute to form an absolute URL first.
    GPosition datapos=args.contains("data");
    if(datapos)
    {
      GPosition typePos(args.contains("type"));
      if(typePos)
      {
        // Objects of another mime type are silently skipped.
        if(args[typePos] != mimetype)
          continue;
      }
      // A data value starting with '/' is resolved against
      // codebase.base() instead of the codebase itself.
      const GURL url = (pdjvufile) ? *pdjvufile
        : GURL::UTF8(args[datapos],
                     (args[datapos][0] == '/') ? codebase.base() : codebase);
      // Page dimensions declared in the XML; 0 when absent.
      int width;
      {
        GPosition widthPos=args.contains("width");
        width=(widthPos)?args[widthPos].toInt():0;
      }
      int height;
      {
        GPosition heightPos=args.contains("height");
        height=(heightPos)?args[heightPos].toInt():0;
      }
      // Settings gathered from the PARAM children; a later PARAM
      // overrides an earlier one for the same setting.
      GUTF8String gamma;
      GUTF8String dpi;
      GUTF8String page;
      GUTF8String do_ocr;
      {
        GPosition paramPos(GObject.contains(paramtag));
        if(paramPos)
        {
          const GPList<lt_XMLTags> Params(GObject[paramPos]);
          for(GPosition loc=Params;loc;++loc)
          {
            const GMap<GUTF8String,GUTF8String> &pargs=Params[loc]->get_args();
            GPosition namepos=pargs.contains("name");
            if(namepos)
            {
              GPosition valuepos=pargs.contains("value");
              if(valuepos)
              {
                // Parameter names are matched case-insensitively.
                const GUTF8String name=pargs[namepos].downcase();
                const GUTF8String &value=pargs[valuepos];
                if(name == "flags")
                {
                  // "flags" packs several settings into one value.
                  // This local `args' shadows the OBJECT attribute map.
                  GMap<GUTF8String,GUTF8String> args;
                  lt_XMLTags::ParseValues(value,args,true);
                  if(args.contains("page"))
                  {
                    page=args["page"];
                  }
                  if(args.contains("dpi"))
                  {
                    dpi=args["dpi"];
                  }
                  if(args.contains("gamma"))
                  {
                    gamma=args["gamma"];
                  }
                  if(args.contains("ocr"))
                  {
                    do_ocr=args["ocr"];
                  }
                }else if(name == "page")
                {
                  page=value;
                }else if(name == "dpi")
                {
                  dpi=value;
                }else if(name == "gamma")
                {
                  gamma=value;
                }else if(name == "ocr")
                {
                  do_ocr=value;
                }
              }
            }
          }
        }
      }
      const GP<DjVuFile> dfile(get_file(url,page));
      if(dpi.is_int() || gamma.is_float())
      {
        // ChangeInfo() itself ignores values outside its accepted
        // ranges.  This `pos' shadows the GPosition declared above and
        // serves as both start and end position for toDouble().
        int pos=0;
        ChangeInfo(*dfile,dpi.toInt(),gamma.toDouble(pos,pos));
      }
      parse_anno(width,height,GObject,Maps,*dfile);
      parse_meta(GObject,*dfile);
      parse_text(width,height,GObject,*dfile);
      ChangeTextOCR(do_ocr,width,height,dfile);
    }
  }
}
739
740 void
parse_anno(const int width,const int height,const lt_XMLTags & GObject,GMap<GUTF8String,GP<lt_XMLTags>> & Maps,DjVuFile & dfile)741 lt_XMLParser::Impl::parse_anno(
742 const int width,
743 const int height,
744 const lt_XMLTags &GObject,
745 GMap<GUTF8String,GP<lt_XMLTags> > &Maps,
746 DjVuFile &dfile )
747 {
748 GP<lt_XMLTags> map;
749 {
750 GPosition usemappos=GObject.get_args().contains("usemap");
751 if(usemappos)
752 {
753 const GUTF8String mapname(GObject.get_args()[usemappos]);
754 GPosition mappos=Maps.contains(mapname);
755 if(!mappos)
756 {
757 G_THROW((ERR_MSG("XMLAnno.map_find") "\t")+mapname );
758 }else
759 {
760 map=Maps[mappos];
761 }
762 }
763 }
764 if(map)
765 {
766 ChangeAnno(width,height,dfile,*map);
767 }
768 }
769
// Make sure no macro named max interferes with the template below.
#ifdef max
#undef max
#endif
// Returns a when a>b, otherwise b.
template<class TYPE>
static inline TYPE max(TYPE a,TYPE b)
{
  if(a>b)
    return a;
  return b;
}
// Make sure no macro named min interferes with the template below.
#ifdef min
#undef min
#endif
// Returns a when a<b, otherwise b.
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b)
{
  if(a<b)
    return a;
  return b;
}
780
// Used to build the zone tree of the hidden text layer.
//   parent - zone that receives the new child zone.  For tag names that
//            are not one of the known text tags, the parent itself is
//            (re)used and marked as the PAGE zone.
//   tag    - XML element to convert, together with its content.
//   bs     - stream receiving the text; zone text_start/text_length are
//            offsets into this stream.
//   height - page height used to flip y coordinates.
//   ws, hs - horizontal and vertical scale factors applied to "coords".
// true is returned if the GRect is known for this object,
// and false, if the rectangle's size is just the parent size.
// NOTE(review): for zones without coordinates the return value is that
// of the last child processed in the accumulation loop - confirm this
// matches the intent stated above.
static bool
make_child_layer(
  DjVuTXT::Zone &parent,
  const lt_XMLTags &tag, ByteStream &bs,
  const int height, const double ws, const double hs)
{
  bool retval=true;
  // the plugin thinks there are only Pages, Lines and Words
  // so we don't make Paragraphs, Regions and Columns zones
  // if we did the plugin is not able to search the text but
  // DjVuToText writes out all the text anyway
  DjVuTXT::Zone *self_ptr;
  // Separator written to bs after this zone's text (0 means none).
  char sepchar;
  const GUTF8String name(tag.get_name());
  if(name == charactertag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::CHARACTER;
    sepchar=0;
  }else if(name == wordtag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::WORD;
    sepchar=' ';
  }else if(name == linetag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::LINE;
    sepchar=DjVuTXT::end_of_line;
  }else if(name == paragraphtag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::PARAGRAPH;
    sepchar=DjVuTXT::end_of_paragraph;
  }else if(name == regiontag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::REGION;
    sepchar=DjVuTXT::end_of_region;
  }else if(name == pagecolumntag)
  {
    self_ptr=parent.append_child();
    self_ptr->ztype = DjVuTXT::COLUMN;
    sepchar=DjVuTXT::end_of_column;
  }else
  {
    // Any other tag: no child is created, the parent zone is the page.
    self_ptr = &parent;
    self_ptr->ztype = DjVuTXT::PAGE;
    sepchar=0;
  }
  DjVuTXT::Zone &self = *self_ptr;
  self.text_start = bs.tell();
  // Shorthands for self's rectangle.  In the PAGE case above these
  // alias parent.rect.
  int &xmin=self.rect.xmin, &ymin=self.rect.ymin,
    &xmax=self.rect.xmax, &ymax=self.rect.ymax;
  // Note the swapped min/max: the "min" fields receive the larger of the
  // parent's bounds and the "max" fields the smaller.
  // NOTE(review): presumably an inverted box used as the neutral start
  // value for the min/max accumulation below - confirm.
  GRect default_rect;
  default_rect.xmin=max(parent.rect.xmax,parent.rect.xmin);
  default_rect.xmax=min(parent.rect.xmax,parent.rect.xmin);
  default_rect.ymin=max(parent.rect.ymax,parent.rect.ymin);
  default_rect.ymax=min(parent.rect.ymax,parent.rect.ymin);
  // Now if there are coordinates, use those.
  // From here on `pos' doubles as a flag: it stays non-null only when
  // all four coordinates were present.  With fewer than four values
  // the fields read so far are still assigned but pos ends up null.
  GPosition pos(tag.get_args().contains("coords"));
  if(pos)
  {
    GList<int> rectArgs;
    intList(tag.get_args()[pos], rectArgs);
    if((pos=rectArgs))
    {
      xmin=(int)(ws*(double)rectArgs[pos]);
      if(++pos)
      {
        // y values are flipped relative to row height-1.
        ymin=(height-1)-(int)(hs*(double)rectArgs[pos]);
        if(++pos)
        {
          xmax=(int)(ws*(double)rectArgs[pos]);
          if(++pos)
          {
            ymax=(height-1)-(int)(hs*(double)rectArgs[pos]);
            if(xmin>xmax) // Make sure xmin is really minimum
            {
              const int t=xmin;
              xmin=xmax;
              xmax=t;
            }
            if(ymin>ymax) // Make sure ymin is really minimum
            {
              const int t=ymin;
              ymin=ymax;
              ymax=t;
            }
          }
        }
      }
    }
  }
  if(self.ztype == DjVuTXT::CHARACTER)
  {
    // Characters are leaves: only their own raw text is written.
    if(! pos)
    {
      self.rect=default_rect;
      retval=false;
    }
    const GUTF8String raw(tag.get_raw().fromEscaped());
    // NOTE(review): the substr appears to trim surrounding white space;
    // the exact semantics of firstEndSpace() are not visible here.
    const int i=raw.nextNonSpace(0);
    bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
    if(sepchar)
      bs.write8(sepchar);
    self.text_length = bs.tell() - self.text_start;
  }else if(pos)
  {
    // The zone has its own complete rectangle: recurse into the content,
    // or write the raw text when there is no content.
    pos=tag.get_content();
    if(pos)
    {
      for(pos=tag.get_content(); pos; ++pos)
      {
        // NOTE(review): t is dereferenced without a null check.
        const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
        make_child_layer(self, *t, bs, height,ws,hs);
      }
      if(sepchar)
        bs.write8(sepchar);
      self.text_length = bs.tell() - self.text_start;
    }else
    {
      const GUTF8String raw(tag.get_raw().fromEscaped());
      const int i=raw.nextNonSpace(0);
      bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
      if(sepchar)
        bs.write8(sepchar);
      self.text_length = bs.tell() - self.text_start;
    }
  }else
  {
    // No usable coordinates: derive the rectangle from the children.
    self.rect=default_rect;
    if((pos=tag.get_content()))
    {
      do
      {
        const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
        // Remember the box accumulated so far and reset self.rect
        // before the child is processed (the child updates its parent's
        // rect, i.e. self.rect, at the end of its own call).
        const GRect save_rect(self.rect);
        self.rect=default_rect;
        if ((retval = make_child_layer(self, *t, bs, height, ws, hs)))
        {
          xmin=min(save_rect.xmin,xmin);
          xmax=max(save_rect.xmax,xmax);
          ymin=min(save_rect.ymin,ymin);
          ymax=max(save_rect.ymax,ymax);
        }else
        {
          // If the child doesn't have coordinates, we need to use a box
          // at least as big as the parent's coordinates.
          xmin=min(save_rect.xmin,default_rect.xmax);
          xmax=max(save_rect.xmax,default_rect.xmin);
          ymin=min(save_rect.ymin,default_rect.ymax);
          ymax=max(save_rect.ymax,default_rect.ymin);
          // Process the remaining content without further box
          // bookkeeping.  The loop starts at the current position, so
          // the child that just returned false is passed to
          // make_child_layer() again.
          for(; pos; ++pos)
          {
            const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
            make_child_layer(self, *t, bs, height,ws,hs);
          }
          break;
        }
      } while(++pos);
      if(sepchar)
        bs.write8(sepchar);
      self.text_length = bs.tell() - self.text_start;
    }else
    {
      const GUTF8String raw(tag.get_raw().fromEscaped());
      const int i=raw.nextNonSpace(0);
      bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
      if(sepchar)
        bs.write8(sepchar);
      self.text_length = bs.tell() - self.text_start;
    }
  }
  // Grow the parent's rectangle so that it contains this zone.
  parent.rect.xmin=min(xmin,parent.rect.xmin);
  parent.rect.ymin=min(ymin,parent.rect.ymin);
  parent.rect.xmax=max(xmax,parent.rect.xmax);
  parent.rect.ymax=max(ymax,parent.rect.ymax);
  // Finally normalize this zone's own rectangle.
  if(xmin>xmax)
  {
    const int t=xmin;
    xmin=xmax;
    xmax=t;
  }
  if(ymin>ymax)
  {
    const int t=ymin;
    ymin=ymax;
    ymax=t;
  }
//  DjVuPrintMessage("(%d,%d)(%d,%d)<<<\\%o>>>\n",
//    xmin,ymin,xmax,ymax, sepchar);
  return retval;
}
978
979 void
ChangeTextOCR(const GUTF8String & value,const int width,const int height,const GP<DjVuFile> & dfile)980 lt_XMLParser::Impl::ChangeTextOCR(
981 const GUTF8String &value,
982 const int width,
983 const int height,
984 const GP<DjVuFile> &dfile)
985 {
986 if(value.length() && value.downcase() != "false")
987 {
988 const GP<ByteStream> bs=OCRcallback(value,DjVuImage::create(dfile));
989 if( bs && bs->size() )
990 {
991 const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
992 ChangeText(width,height,*dfile,*tags);
993 }
994 }
995 }
996
997 void
ChangeMeta(DjVuFile & dfile,const lt_XMLTags & tags)998 lt_XMLParser::Impl::ChangeMeta(
999 DjVuFile &dfile, const lt_XMLTags &tags )
1000 {
1001 dfile.resume_decode(true);
1002 GP<ByteStream> gbs(ByteStream::create());
1003 tags.write(*gbs,false);
1004 gbs->seek(0L);
1005 GUTF8String raw(gbs->getAsUTF8());
1006 if(raw.length())
1007 {
1008 //GUTF8String gs="<"+(metadatatag+(">"+raw))+"</"+metadatatag+">\n");
1009 dfile.change_meta(raw+"\n");
1010 }else
1011 {
1012 dfile.change_meta(GUTF8String());
1013 }
1014 }
1015
1016 void
ChangeText(const int width,const int height,DjVuFile & dfile,const lt_XMLTags & tags)1017 lt_XMLParser::Impl::ChangeText(
1018 const int width, const int height,
1019 DjVuFile &dfile, const lt_XMLTags &tags )
1020 {
1021 dfile.resume_decode(true);
1022
1023 GP<DjVuText> text = DjVuText::create();
1024 GP<DjVuTXT> txt = text->txt = DjVuTXT::create();
1025
1026 // to store the new text
1027 GP<ByteStream> textbs = ByteStream::create();
1028
1029 GP<DjVuInfo> info=(dfile.info);
1030 if(info)
1031 {
1032 const int h=info->height;
1033 const int w=info->width;
1034 txt->page_zone.text_start = 0;
1035 DjVuTXT::Zone &parent=txt->page_zone;
1036 parent.rect.xmin=0;
1037 parent.rect.ymin=0;
1038 parent.rect.ymax=h;
1039 parent.rect.xmax=w;
1040 double ws=1.0;
1041 if(width && width != w)
1042 {
1043 ws=((double)w)/((double)width);
1044 }
1045 double hs=1.0;
1046 if(height && height != h)
1047 {
1048 hs=((double)h)/((double)height);
1049 }
1050 make_child_layer(parent, tags, *textbs, h, ws,hs);
1051 textbs->write8(0);
1052 long len = textbs->tell();
1053 txt->page_zone.text_length = len;
1054 textbs->seek(0,SEEK_SET);
1055 textbs->read(txt->textUTF8.getbuf(len), len);
1056
1057 dfile.change_text(txt,false);
1058 }
1059 }
1060
1061 void
parse_text(const int width,const int height,const lt_XMLTags & GObject,DjVuFile & dfile)1062 lt_XMLParser::Impl::parse_text(
1063 const int width,
1064 const int height,
1065 const lt_XMLTags &GObject,
1066 DjVuFile &dfile )
1067 {
1068 GPosition textPos = GObject.contains(hiddentexttag);
1069 if(textPos)
1070 {
1071 // loop through the hidden text - there should only be one
1072 // if there are more ??only the last one will be saved??
1073 GPList<lt_XMLTags> textTags = GObject[textPos];
1074 GPosition pos = textTags;
1075 ChangeText(width,height,dfile,*textTags[pos]);
1076 }
1077 }
1078
1079 void
parse_meta(const lt_XMLTags & GObject,DjVuFile & dfile)1080 lt_XMLParser::Impl::parse_meta(
1081 const lt_XMLTags &GObject,
1082 DjVuFile &dfile )
1083 {
1084 GPosition metaPos = GObject.contains(metadatatag);
1085 if(metaPos)
1086 {
1087 // loop through the hidden text - there should only be one
1088 // if there are more ??only the last one will be saved??
1089 GPList<lt_XMLTags> metaTags = GObject[metaPos];
1090 GPosition pos = metaTags;
1091 ChangeMeta(dfile,*metaTags[pos]);
1092 }
1093 }
1094
1095 static GP<ByteStream>
OCRcallback(void * const xarg,lt_XMLParser::mapOCRcallback * const xcallback,const GUTF8String & value,const GP<DjVuImage> & image)1096 OCRcallback(
1097 void * const xarg,
1098 lt_XMLParser::mapOCRcallback * const xcallback,
1099 const GUTF8String &value,
1100 const GP<DjVuImage> &image )
1101 {
1102 GP<ByteStream> retval;
1103 static void *arg=0;
1104 static lt_XMLParser::mapOCRcallback *callback=0;
1105 if(image)
1106 {
1107 if(callback)
1108 retval=callback(arg,value,image);
1109 }else
1110 {
1111 arg=xarg;
1112 callback=xcallback;
1113 }
1114 return retval;
1115 }
1116
1117 void
setOCRcallback(void * const arg,mapOCRcallback * const callback)1118 lt_XMLParser::setOCRcallback(
1119 void * const arg,
1120 mapOCRcallback * const callback)
1121 {
1122 ::OCRcallback(arg,callback);
1123 }
1124
1125
1126 #ifdef HAVE_NAMESPACES
1127 }
1128 # ifndef NOT_USING_DJVU_NAMESPACE
1129 using namespace DJVU;
1130 # endif
1131 #endif
1132