1 //C-  -*- C++ -*-
2 //C- -------------------------------------------------------------------
3 //C- DjVuLibre-3.5
4 //C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5 //C- Copyright (c) 2001  AT&T
6 //C-
7 //C- This software is subject to, and may be distributed under, the
8 //C- GNU General Public License, either Version 2 of the license,
9 //C- or (at your option) any later version. The license should have
10 //C- accompanied the software or you may obtain a copy of the license
11 //C- from the Free Software Foundation at http://www.fsf.org .
12 //C-
13 //C- This program is distributed in the hope that it will be useful,
14 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 //C- GNU General Public License for more details.
17 //C-
18 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19 //C- Lizardtech Software.  Lizardtech Software has authorized us to
20 //C- replace the original DjVu(r) Reference Library notice by the following
21 //C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22 //C-
23 //C-  ------------------------------------------------------------------
24 //C- | DjVu (r) Reference Library (v. 3.5)
25 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
27 //C- | 6,058,214 and patents pending.
28 //C- |
29 //C- | This software is subject to, and may be distributed under, the
30 //C- | GNU General Public License, either Version 2 of the license,
31 //C- | or (at your option) any later version. The license should have
32 //C- | accompanied the software or you may obtain a copy of the license
33 //C- | from the Free Software Foundation at http://www.fsf.org .
34 //C- |
35 //C- | The computer code originally released by LizardTech under this
36 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
37 //C- | ORIGINAL CODE."  Subject to any third party intellectual property
38 //C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39 //C- | non-exclusive license to make, use, sell, or otherwise dispose of
40 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42 //C- | General Public License.   This grant only confers the right to
43 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44 //C- | the extent such infringement is reasonably necessary to enable
45 //C- | recipient to make, have made, practice, sell, or otherwise dispose
46 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47 //C- | any greater extent that may be necessary to utilize further
48 //C- | modifications or combinations.
49 //C- |
50 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54 //C- +------------------------------------------------------------------
55 
56 #ifndef _DJVMDIR_H
57 #define _DJVMDIR_H
58 #ifdef HAVE_CONFIG_H
59 #include "config.h"
60 #endif
61 #if NEED_GNUG_PRAGMAS
62 # pragma interface
63 #endif
64 
65 
66 /** @name DjVmDir.h
67     Files #"DjVmDir.h"# and #"DjVmDir.cpp"# implement class \Ref{DjVmDir} for
68     representing the directory of a DjVu multipage document.
69 
70     {\bf Bundled vs. Indirect format} --- There are currently two multipage
71     DjVu formats supported: {\em bundled} and {\em indirect}.  In the first
72     format all component files composing a given document are packaged (or
73     bundled) into one file, in the second one every page and component is
74     stored in a separate file and there is one more file, which contains the
75     list of all others.
76 
77     {\bf Multipage DjVu format} --- Multipage DjVu documents follow the EA
78     IFF85 format (cf. \Ref{IFFByteStream.h}.)  A document is composed of a
79     #"FORM:DJVM"# whose first chunk is a #"DIRM"# chunk containing the {\em
80     document directory}.  This directory lists all component files composing
81     the given document, helps to access every component file and identify the
82     pages of the document.
83     \begin{itemize}
84     \item In a {\em bundled} multipage file, the component files
85          are stored immediately after the #"DIRM"# chunk,
86          within the #"FORM:DJVM"# composite chunk.
87     \item In an {\em indirect} multipage file, the component files are
88           stored in different files whose URLs are composed using information
89           stored in the #"DIRM"# chunk.
90     \end{itemize}
91     Most of the component files represent pages of a document.  Some files
92     however represent data shared by several pages.  The pages refer to these
93     supporting files by means of an inclusion chunk (#"INCL"# chunks)
94     identifying the supporting file.
95 
96     {\bf Document Directory} --- Every directory record describes a component
97     file.  Each component file is identified by a small string named the
98     identifier (ID).  Each component file also contains a file name and a
99     title.  The format of the #"DIRM"# chunk is described in section
100     \Ref{Format of the DIRM chunk.}.
101 
102     Theoretically, IDs are used to uniquely identify each component file in
103     #"INCL"# chunks, names are used to compose the URLs of the component
104     files in an indirect multipage DjVu file, and titles are cosmetic names
105     possibly displayed when viewing a page of a document.  There are however
106     many problems with this scheme, and we {\em strongly suggest}, with the
107     current implementation, to always make the file ID, the file name and the
108     file title identical.
109 
110     @memo Implements DjVu multipage document directory
111     @author Andrei Erofeev <eaf@geocities.com>
112 */
113 //@{
114 
115 
116 
117 #include "GString.h"
118 #include "GThreads.h"
119 
120 #ifdef HAVE_NAMESPACES
121 namespace DJVU {
122 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
123 }
124 #endif
125 #endif
126 
127 class ByteStream;
128 
129 /** Implements DjVu multipage document directory.  There are currently two
130     multipage DjVu formats supported: {\em bundled} and {\em indirect}.  In
131     the first format all component files composing a given document are
132     packaged (or bundled) into one file, in the second one every page and
133     component is stored in a separate file and there is one more file, which
134     contains the list of all others.
135 
136     The multipage document directory lists all component files composing the
137     given document, helps to access every file, identify pages and maintain
138     user-specified shortcuts.  Every directory record describes a file
139     composing the document.  Each file is identified by a small string named
140     the identifier (ID).  Each file may also contain a file name and a title.
141 
142     The #DjVmDir# class represents a multipage document directory.  Its main
143     purpose is to encode and decode the document directory when writing or
144     reading the #DIRM# chunk.  Normally you don't have to create this class
145     yourself. It's done automatically when \Ref{DjVmDoc} class initializes
146     itself. It may be useful though to be able to access records in the
147     directory because some classes (like \Ref{DjVuDocument} and \Ref{DjVmDoc})
148     return a pointer to #DjVmDir# in some cases. */
149 
150 class DJVUAPI DjVmDir : public GPEnabled
151 {
152 protected:
153       /** Class \Ref{DjVmDir::File} represents the directory records
154           managed by class \Ref{DjVmDir}. */
DjVmDir(void)155    DjVmDir(void) { } ;
156 public:
157    class File;
158 
159    static const int version;
160 
161       /** Class \Ref{DjVmDir::File} represents the directory records
162           managed by class \Ref{DjVmDir}. */
create(void)163    static GP<DjVmDir> create(void) {return new DjVmDir; } ;
164 
165       /** Decodes the directory from the specified stream. */
166    void decode(const GP<ByteStream> &stream);
167       /** Encodes the directory into the specified stream. */
168    void encode(const GP<ByteStream> &stream, const bool do_rename=false) const;
169       /** Encodes the directory into the specified stream,
170           explicitely as bundled or indirect. */
171   void encode(const GP<ByteStream> &stream,
172               const bool bundled, const bool do_rename) const;
173       /** Tests if directory defines an {\em indirect} document. */
174    inline bool is_indirect(void) const;
175       /** Tests if the directory defines a {\em bundled} document. */
176    inline bool is_bundled(void) const;
177       /** Translates page numbers to file records. */
178    GP<File> page_to_file(int page_num) const;
179       /** Translates file names to file records. */
180    GP<File> name_to_file(const GUTF8String & name) const;
181       /** Translates file IDs to file records. */
182    GP<File> id_to_file(const GUTF8String &id) const;
183       /** Translates file shortcuts to file records. */
184    GP<File> title_to_file(const GUTF8String &title, GPosition spos) const;
185    GP<File> title_to_file(const GUTF8String &title) const;
186       /** Access file record by position. */
187    GP<File> pos_to_file(int fileno, int *ppageno=0) const;
188       /** Returns position of the file in the directory. */
189    int get_file_pos(const File * f) const;
190       /** Returns position of the given page in the directory. */
191    int get_page_pos(int page_num) const;
192       /** Check for duplicate names, and resolve them. */
193    GPList<File> resolve_duplicates(const bool save_as_bundled);
194       /** Returns a copy of the list of file records. */
195    GPList<File> get_files_list(void) const;
196       /** Returns the number of file records. */
197    int get_files_num(void) const;
198       /** Returns the number of file records representing pages. */
199    int get_pages_num(void) const;
200       /** Returns back pointer to the file with #SHARED_ANNO# flag.
201         Note that there may be only one file with shared annotations
202         in any multipage DjVu document. */
203    GP<File> get_shared_anno_file(void) const;
204       /** Changes the title of the file with ID #id#. */
205    void set_file_title(const GUTF8String &id, const GUTF8String &title);
206       /** Changes the name of the file with ID #id#. */
207    void set_file_name(const GUTF8String &id, const GUTF8String &name);
208       /** Inserts the specified file record at the specified position.
209         Specifying #pos# equal to #-1# means to append.  The actual position
210         inserted is returned. */
211    int insert_file(const GP<File> & file, int pos=-1);
212       /** Removes a file record with ID #id#. */
213    void delete_file(const GUTF8String &id);
214 private:
215    GCriticalSection class_lock;
216    GPList<File>	files_list;
217    GPArray<File> page2file;
218    GPMap<GUTF8String, File> name2file;
219    GPMap<GUTF8String, File> id2file;
220 private: //dummy stuff
221    static void decode(ByteStream *);
222    static void encode(ByteStream *);
223 };
224 
225 class DJVUAPI DjVmDir::File : public GPEnabled
226 {
227 public:
228   // Out of the record: INCLUDE below must be zero and PAGE must be one.
229   // This is to avoid problems with the File constructor, which now takes
230   // 'int file_type' as the last argument instead of 'bool is_page'
231 
232   /** File type. Possible file types are:
233      \begin{description}
234        \item[PAGE] This is a top level page file. It may include other
235          #INCLUDE#d files, which may in turn be shared between
236          different pages.
237        \item[INCLUDE] This file is included into some other file inside
238          this document.
239        \item[THUMBNAILS] This file contains thumbnails for the document
240          pages.
241        \item[SHARED_ANNO] This file contains annotations shared by
242          all the pages. It's supposed to be included into every page
243          for the annotations to take effect. There may be only one
244          file with shared annotations in a document.
245      \end{description} */
246   enum FILE_TYPE { INCLUDE=0, PAGE=1, THUMBNAILS=2, SHARED_ANNO=3 };
247 protected:
248   /** Default constructor. */
249   File(void);
250 
251 public:
create(void)252   static GP<File> create(void) { return new File(); }
253   static GP<File> create(const GUTF8String &load_name,
254      const GUTF8String &save_name, const GUTF8String &title,
255      const FILE_TYPE file_type);
256 
257   /** Check for filenames that are not valid for the native encoding,
258       and change them. */
259   const GUTF8String &check_save_name(const bool as_bundled);
260 
261   /** File name.  The optional file name must be unique and is the name
262       that will be used when the document is saved to an indirect file.
263       If not assigned, the value of #id# will be used for this purpose.
264       By keeping the name in {\em bundled} document we guarantee, that it
265       can be expanded later into {\em indirect} document and files will
266       still have the same names, if the name is legal on a given filesystem.
267     */
268   const GUTF8String &get_save_name(void) const;
269 
270   /** File identifier.  The encoder assigns a unique identifier to each file
271       in a multipage document. This is the name used when loading files.
272       Indirection chunks in other files (#"INCL"# chunks) may refer to another
273       file using its identifier. */
274   const GUTF8String &get_load_name(void) const;
275   void set_load_name(const GUTF8String &id);
276 
277   /** File title.  The file title is assigned by the user and may be used as
278       a shortcut for viewing a particular page.  Names like #"chapter1"# or
279       #"appendix"# are appropriate. */
280   const GUTF8String &get_title() const;
281   void set_title(const GUTF8String &id);
282 
283   /** Reports an ascii string indicating file type. */
284   GUTF8String get_str_type(void) const;
285 
286   /** Offset of the file data in a bundled DJVM file.  This number is
287       relevant in the {\em bundled} case only when everything is packed into
288       one single file. */
289   int offset;
290 
291   /** Size of the file data in a bundled DJVM file.  This number is
292       relevant in the {\em bundled} case only when everything is
293       packed into one single file. */
294   int size;
295 
296   /** Have we checked the saved file name, to see if it is valid on the
297       local disk? */
298   bool valid_name;
299 
300   /** Tests if this file represents a page of the document. */
is_page(void)301   bool is_page(void) const
302   {
303     return (flags & TYPE_MASK)==PAGE;
304   }
305 
306   /** Returns #TRUE# if this file is included into some other files of
307       this document. */
is_include(void)308   bool is_include(void) const
309   {
310     return (flags & TYPE_MASK)==INCLUDE;
311   }
312 
313   /** Returns #TRUE# if this file contains thumbnails for the document pages. */
is_thumbnails(void)314   bool is_thumbnails(void) const
315   {
316     return (flags & TYPE_MASK)==THUMBNAILS;
317   }
318 
319   /** Returns the page number of this file. This function returns
320       #-1# if this file does not represent a page of the document. */
is_shared_anno(void)321   bool is_shared_anno(void) const
322   { return (flags & TYPE_MASK)==SHARED_ANNO; }
323 
get_page_num(void)324   int get_page_num(void) const
325   { return page_num; }
326 protected:
327   GUTF8String name;
328   GUTF8String oldname;
329   GUTF8String id;
330   GUTF8String title;
331   void set_save_name(const GUTF8String &name);
332 private:
333       friend class DjVmDir;
334       enum FLAGS_0 { IS_PAGE_0=1, HAS_NAME_0=2, HAS_TITLE_0=4 };
335       enum FLAGS_1 { HAS_NAME=0x80, HAS_TITLE=0x40, TYPE_MASK=0x3f };
336       unsigned char flags;
337       int page_num;
338 };
339 
340 inline const GUTF8String &
get_load_name(void)341 DjVmDir::File::get_load_name(void) const
342 { return id; }
343 
344 inline const GUTF8String &
get_title()345 DjVmDir::File::get_title() const
346 { return *(title.length()?&title:&id); }
347 
348 inline void
set_title(const GUTF8String & xtitle)349 DjVmDir::File::set_title(const GUTF8String &xtitle) { title=xtitle; }
350 
351 /** @name Format of the DIRM chunk.
352 
353     {\bf Variants} --- There are two versions of the #"DIRM"# chunk format.
354     The version number is identified by the seven low bits of the first byte
355     of the chunk.  Version {\bf 0} is obsolete and should never be used.  This
356     section describes version {\bf 1}.  There are two major multipage DjVu
357     formats supported: {\em bundled} and {\em indirect}.  The #"DIRM"# chunk
358     indicates which format is used in the most significant bit of the first
359     byte of the chunk.  The document is bundled when this bit is set.
360     Otherwise the document is indirect.
361 
362     {\bf Unencoded data} --- The #"DIRM"# chunk is composed of some unencoded
363     data followed by \Ref{bzz} encoded data.  The unencoded data starts with
364     the version byte and a 16 bit integer representing the number of component
365     files.  All integers are encoded with the most significant byte first.
366     \begin{verbatim}
367           BYTE:             Flags/Version:  0b<bundled>0000001
368           INT16:            Number of component files.
369     \end{verbatim}
370     When the document is a bundled document (i.e. the flag #bundled# is set),
371     this header is followed by the offsets of each of the component files within
372     the #"FORM:DJVM"#.  These offsets allow for random component file access.
373     \begin{verbatim}
374           INT32:            Offset of first component file.
375           INT32:            Offset of second component file.
376           ...
377           INT32:            Offset of last component file.
378     \end{verbatim}
379 
380     {\bf BZZ encoded data} --- The rest of the chunk is entirely compressed
381     with the BZZ general purpose compressor.  We describe now the data fed
382     into (or retrieved from) the BZZ codec (cf. \Ref{BSByteStream}.)  First
383     come the sizes and the flags associated with each component file.
384     \begin{verbatim}
385           INT24:             Size of the first component file.
386           INT24:             Size of the second component file.
387           ...
388           INT24:             Size of the last component file.
389           BYTE:              Flag byte for the first component file.
390           BYTE:              Flag byte for the second component file.
391           ...
392           BYTE:              Flag byte for the last component file.
393     \end{verbatim}
394     The flag bytes have the following format:
395     \begin{verbatim}
396           0b<hasname><hastitle>000000     for a file included by other files.
397           0b<hasname><hastitle>000001     for a file representing a page.
398           0b<hasname><hastitle>000010     for a file containing thumbnails.
              0b<hasname><hastitle>000011     for a file containing shared annotations.
399     \end{verbatim}
400     Flag #hasname# is set when the name of the file is different from the file
401     ID.  Flag #hastitle# is set when the title of the file is different from
402     the file ID.  These flags are used to avoid encoding the same string three
403     times.  Then come a sequence of zero terminated strings.  There are one to
404     three such strings per component file.  The first string contains the ID
405     of the component file.  The second string contains the name of the
406     component file.  It is only present when the flag #hasname# is set. The third
407     one contains the title of the component file. It is only present when the
408     flag #hastitle# is set. The \Ref{bzz} encoding system makes sure that
409     all these strings will be encoded efficiently despite their possible
410     redundancies.
411     \begin{verbatim}
412           ZSTR:     ID of the first component file.
413           ZSTR:     Name of the first component file (only if #hasname# is set.)
414           ZSTR:     Title of the first component file (only if #hastitle# is set.)
415           ...
416           ZSTR:     ID of the last component file.
417           ZSTR:     Name of the last component file (only if #hasname# is set.)
418           ZSTR:     Title of the last component file (only if #hastitle# is set.)
419     \end{verbatim}
420 
421     @memo Description of the format of the DIRM chunk.  */
422 //@}
423 
424 
425 
426 // -------------- IMPLEMENTATION
427 
428 
429 inline bool
is_bundled(void)430 DjVmDir::is_bundled(void) const
431 {
432   return ! is_indirect();
433 }
434 
435 inline bool
is_indirect(void)436 DjVmDir::is_indirect(void) const
437 {
438   GCriticalSectionLock lock((GCriticalSection *) &class_lock);
439   return ( files_list.size() && files_list[files_list] != 0 &&
440            files_list[files_list]->offset==0 );
441 }
442 
443 inline GP<DjVmDir::File>
title_to_file(const GUTF8String & title)444 DjVmDir::title_to_file(const GUTF8String &title) const
445 {
446   GPosition pos;
447   return title_to_file(title, pos);
448 }
449 
450 
451 
452 // ----- THE END
453 
454 #ifdef HAVE_NAMESPACES
455 }
456 # ifndef NOT_USING_DJVU_NAMESPACE
457 using namespace DJVU;
458 # endif
459 #endif
460 #endif
461