1 //C- -*- C++ -*-
2 //C- -------------------------------------------------------------------
3 //C- DjVuLibre-3.5
4 //C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
5 //C- Copyright (c) 2001 AT&T
6 //C-
7 //C- This software is subject to, and may be distributed under, the
8 //C- GNU General Public License, either Version 2 of the license,
9 //C- or (at your option) any later version. The license should have
10 //C- accompanied the software or you may obtain a copy of the license
11 //C- from the Free Software Foundation at http://www.fsf.org .
12 //C-
13 //C- This program is distributed in the hope that it will be useful,
14 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 //C- GNU General Public License for more details.
17 //C-
18 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19 //C- Lizardtech Software. Lizardtech Software has authorized us to
20 //C- replace the original DjVu(r) Reference Library notice by the following
21 //C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22 //C-
23 //C- ------------------------------------------------------------------
24 //C- | DjVu (r) Reference Library (v. 3.5)
25 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
27 //C- | 6,058,214 and patents pending.
28 //C- |
29 //C- | This software is subject to, and may be distributed under, the
30 //C- | GNU General Public License, either Version 2 of the license,
31 //C- | or (at your option) any later version. The license should have
32 //C- | accompanied the software or you may obtain a copy of the license
33 //C- | from the Free Software Foundation at http://www.fsf.org .
34 //C- |
35 //C- | The computer code originally released by LizardTech under this
36 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
37 //C- | ORIGINAL CODE." Subject to any third party intellectual property
38 //C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39 //C- | non-exclusive license to make, use, sell, or otherwise dispose of
40 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42 //C- | General Public License. This grant only confers the right to
43 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44 //C- | the extent such infringement is reasonably necessary to enable
45 //C- | recipient to make, have made, practice, sell, or otherwise dispose
46 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47 //C- | any greater extent that may be necessary to utilize further
48 //C- | modifications or combinations.
49 //C- |
50 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54 //C- +------------------------------------------------------------------
55
56 #ifndef _DJVMDIR_H
57 #define _DJVMDIR_H
58 #ifdef HAVE_CONFIG_H
59 #include "config.h"
60 #endif
61 #if NEED_GNUG_PRAGMAS
62 # pragma interface
63 #endif
64
65
66 /** @name DjVmDir.h
67 Files #"DjVmDir.h"# and #"DjVmDir.cpp"# implement class \Ref{DjVmDir} for
68 representing the directory of a DjVu multipage document.
69
70 {\bf Bundled vs. Indirect format} --- There are currently two multipage
71 DjVu formats supported: {\em bundled} and {\em indirect}. In the first
72 format all component files composing a given document are packaged (or
73 bundled) into one file, in the second one every page and component is
74 stored in a separate file and there is one more file, which contains the
75 list of all others.
76
77 {\bf Multipage DjVu format} --- Multipage DjVu documents follow the EA
78 IFF85 format (cf. \Ref{IFFByteStream.h}.) A document is composed of a
79 #"FORM:DJVM"# whose first chunk is a #"DIRM"# chunk containing the {\em
80 document directory}. This directory lists all component files composing
81 the given document, helps to access every component file and identify the
82 pages of the document.
83 \begin{itemize}
84 \item In a {\em bundled} multipage file, the component files
85 are stored immediately after the #"DIRM"# chunk,
86 within the #"FORM:DJVM"# composite chunk.
87 \item In an {\em indirect} multipage file, the component files are
88 stored in different files whose URLs are composed using information
89 stored in the #"DIRM"# chunk.
90 \end{itemize}
91 Most of the component files represent pages of a document. Some files
92 however represent data shared by several pages. The pages refer to these
93 supporting files by means of an inclusion chunk (#"INCL"# chunks)
94 identifying the supporting file.
95
96 {\bf Document Directory} --- Every directory record describes a component
97 file. Each component file is identified by a small string named the
98 identifier (ID). Each component file also contains a file name and a
99 title. The format of the #"DIRM"# chunk is described in section
100 \Ref{Format of the DIRM chunk.}.
101
102 Theoretically, IDs are used to uniquely identify each component file in
103 #"INCL"# chunks, names are used to compose the URLs of the component
104 files in an indirect multipage DjVu file, and titles are cosmetic names
105 possibly displayed when viewing a page of a document. There are however
106 many problems with this scheme, and we {\em strongly suggest}, with the
107 current implementation to always make the file ID, the file name and the
108 file title identical.
109
110 @memo Implements DjVu multipage document directory
111 @author Andrei Erofeev <eaf@geocities.com>
112 */
113 //@{
114
115
116
117 #include "GString.h"
118 #include "GThreads.h"
119
120 #ifdef HAVE_NAMESPACES
121 namespace DJVU {
122 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
123 }
124 #endif
125 #endif
126
127 class ByteStream;
128
129 /** Implements DjVu multipage document directory. There are currently two
130 multipage DjVu formats supported: {\em bundled} and {\em indirect}. In
131 the first format all component files composing a given document are
132 packaged (or bundled) into one file, in the second one every page and
133 component is stored in a separate file and there is one more file, which
134 contains the list of all others.
135
136 The multipage document directory lists all component files composing the
137 given document, helps to access every file, identify pages and maintain
138 user-specified shortcuts. Every directory record describes a file
139 composing the document. Each file is identified by a small string named
140 the identifier (ID). Each file may also contain a file name and a title.
141
142 The #DjVmDir# class represents a multipage document directory. Its main
143 purpose is to encode and decode the document directory when writing or
144 reading the #DIRM# chunk. Normally you don't have to create this class
145 yourself. It's done automatically when \Ref{DjVmDoc} class initializes
146 itself. It may be useful though to be able to access records in the
147 directory because some classes (like \Ref{DjVuDocument} and \Ref{DjVmDoc})
148 return a pointer to #DjVmDir# in some cases. */
149
150 class DJVUAPI DjVmDir : public GPEnabled
151 {
152 protected:
153 /** Class \Ref{DjVmDir::File} represents the directory records
154 managed by class \Ref{DjVmDir}. */
DjVmDir(void)155 DjVmDir(void) { } ;
156 public:
157 class File;
158
159 static const int version;
160
161 /** Class \Ref{DjVmDir::File} represents the directory records
162 managed by class \Ref{DjVmDir}. */
create(void)163 static GP<DjVmDir> create(void) {return new DjVmDir; } ;
164
165 /** Decodes the directory from the specified stream. */
166 void decode(const GP<ByteStream> &stream);
167 /** Encodes the directory into the specified stream. */
168 void encode(const GP<ByteStream> &stream, const bool do_rename=false) const;
169 /** Encodes the directory into the specified stream,
170 explicitely as bundled or indirect. */
171 void encode(const GP<ByteStream> &stream,
172 const bool bundled, const bool do_rename) const;
173 /** Tests if directory defines an {\em indirect} document. */
174 inline bool is_indirect(void) const;
175 /** Tests if the directory defines a {\em bundled} document. */
176 inline bool is_bundled(void) const;
177 /** Translates page numbers to file records. */
178 GP<File> page_to_file(int page_num) const;
179 /** Translates file names to file records. */
180 GP<File> name_to_file(const GUTF8String & name) const;
181 /** Translates file IDs to file records. */
182 GP<File> id_to_file(const GUTF8String &id) const;
183 /** Translates file shortcuts to file records. */
184 GP<File> title_to_file(const GUTF8String &title, GPosition spos) const;
185 GP<File> title_to_file(const GUTF8String &title) const;
186 /** Access file record by position. */
187 GP<File> pos_to_file(int fileno, int *ppageno=0) const;
188 /** Returns position of the file in the directory. */
189 int get_file_pos(const File * f) const;
190 /** Returns position of the given page in the directory. */
191 int get_page_pos(int page_num) const;
192 /** Check for duplicate names, and resolve them. */
193 GPList<File> resolve_duplicates(const bool save_as_bundled);
194 /** Returns a copy of the list of file records. */
195 GPList<File> get_files_list(void) const;
196 /** Returns the number of file records. */
197 int get_files_num(void) const;
198 /** Returns the number of file records representing pages. */
199 int get_pages_num(void) const;
200 /** Returns back pointer to the file with #SHARED_ANNO# flag.
201 Note that there may be only one file with shared annotations
202 in any multipage DjVu document. */
203 GP<File> get_shared_anno_file(void) const;
204 /** Changes the title of the file with ID #id#. */
205 void set_file_title(const GUTF8String &id, const GUTF8String &title);
206 /** Changes the name of the file with ID #id#. */
207 void set_file_name(const GUTF8String &id, const GUTF8String &name);
208 /** Inserts the specified file record at the specified position.
209 Specifying #pos# equal to #-1# means to append. The actual position
210 inserted is returned. */
211 int insert_file(const GP<File> & file, int pos=-1);
212 /** Removes a file record with ID #id#. */
213 void delete_file(const GUTF8String &id);
214 private:
215 GCriticalSection class_lock;
216 GPList<File> files_list;
217 GPArray<File> page2file;
218 GPMap<GUTF8String, File> name2file;
219 GPMap<GUTF8String, File> id2file;
220 private: //dummy stuff
221 static void decode(ByteStream *);
222 static void encode(ByteStream *);
223 };
224
225 class DJVUAPI DjVmDir::File : public GPEnabled
226 {
227 public:
228 // Out of the record: INCLUDE below must be zero and PAGE must be one.
229 // This is to avoid problems with the File constructor, which now takes
230 // 'int file_type' as the last argument instead of 'bool is_page'
231
232 /** File type. Possible file types are:
233 \begin{description}
234 \item[PAGE] This is a top level page file. It may include other
235 #INCLUDE#d files, which may in turn be shared between
236 different pages.
237 \item[INCLUDE] This file is included into some other file inside
238 this document.
239 \item[THUMBNAILS] This file contains thumbnails for the document
240 pages.
241 \item[SHARED_ANNO] This file contains annotations shared by
242 all the pages. It's supposed to be included into every page
243 for the annotations to take effect. There may be only one
244 file with shared annotations in a document.
245 \end{description} */
246 enum FILE_TYPE { INCLUDE=0, PAGE=1, THUMBNAILS=2, SHARED_ANNO=3 };
247 protected:
248 /** Default constructor. */
249 File(void);
250
251 public:
create(void)252 static GP<File> create(void) { return new File(); }
253 static GP<File> create(const GUTF8String &load_name,
254 const GUTF8String &save_name, const GUTF8String &title,
255 const FILE_TYPE file_type);
256
257 /** Check for filenames that are not valid for the native encoding,
258 and change them. */
259 const GUTF8String &check_save_name(const bool as_bundled);
260
261 /** File name. The optional file name must be unique and is the name
262 that will be used when the document is saved to an indirect file.
263 If not assigned, the value of #id# will be used for this purpose.
264 By keeping the name in {\em bundled} document we guarantee, that it
265 can be expanded later into {\em indirect} document and files will
266 still have the same names, if the name is legal on a given filesystem.
267 */
268 const GUTF8String &get_save_name(void) const;
269
270 /** File identifier. The encoder assigns a unique identifier to each file
271 in a multipage document. This is the name used when loading files.
272 Indirection chunks in other files (#"INCL"# chunks) may refer to another
273 file using its identifier. */
274 const GUTF8String &get_load_name(void) const;
275 void set_load_name(const GUTF8String &id);
276
277 /** File title. The file title is assigned by the user and may be used as
278 a shortcut for viewing a particular page. Names like #"chapter1"# or
279 #"appendix"# are appropriate. */
280 const GUTF8String &get_title() const;
281 void set_title(const GUTF8String &id);
282
283 /** Reports an ascii string indicating file type. */
284 GUTF8String get_str_type(void) const;
285
286 /** Offset of the file data in a bundled DJVM file. This number is
287 relevant in the {\em bundled} case only when everything is packed into
288 one single file. */
289 int offset;
290
291 /** Size of the file data in a bundled DJVM file. This number is
292 relevant in the {\em bundled} case only when everything is
293 packed into one single file. */
294 int size;
295
296 /** Have we checked the saved file name, to see if it is valid on the
297 local disk? */
298 bool valid_name;
299
300 /** Tests if this file represents a page of the document. */
is_page(void)301 bool is_page(void) const
302 {
303 return (flags & TYPE_MASK)==PAGE;
304 }
305
306 /** Returns #TRUE# if this file is included into some other files of
307 this document. */
is_include(void)308 bool is_include(void) const
309 {
310 return (flags & TYPE_MASK)==INCLUDE;
311 }
312
313 /** Returns #TRUE# if this file contains thumbnails for the document pages. */
is_thumbnails(void)314 bool is_thumbnails(void) const
315 {
316 return (flags & TYPE_MASK)==THUMBNAILS;
317 }
318
319 /** Returns the page number of this file. This function returns
320 #-1# if this file does not represent a page of the document. */
is_shared_anno(void)321 bool is_shared_anno(void) const
322 { return (flags & TYPE_MASK)==SHARED_ANNO; }
323
get_page_num(void)324 int get_page_num(void) const
325 { return page_num; }
326 protected:
327 GUTF8String name;
328 GUTF8String oldname;
329 GUTF8String id;
330 GUTF8String title;
331 void set_save_name(const GUTF8String &name);
332 private:
333 friend class DjVmDir;
334 enum FLAGS_0 { IS_PAGE_0=1, HAS_NAME_0=2, HAS_TITLE_0=4 };
335 enum FLAGS_1 { HAS_NAME=0x80, HAS_TITLE=0x40, TYPE_MASK=0x3f };
336 unsigned char flags;
337 int page_num;
338 };
339
340 inline const GUTF8String &
get_load_name(void)341 DjVmDir::File::get_load_name(void) const
342 { return id; }
343
344 inline const GUTF8String &
get_title()345 DjVmDir::File::get_title() const
346 { return *(title.length()?&title:&id); }
347
348 inline void
set_title(const GUTF8String & xtitle)349 DjVmDir::File::set_title(const GUTF8String &xtitle) { title=xtitle; }
350
351 /** @name Format of the DIRM chunk.
352
353 {\bf Variants} --- There are two versions of the #"DIRM"# chunk format.
354 The version number is identified by the seven low bits of the first byte
355 of the chunk. Version {\bf 0} is obsolete and should never be used. This
356 section describes version {\bf 1}. There are two major multipage DjVu
357 formats supported: {\em bundled} and {\em indirect}. The #"DIRM"# chunk
358 indicates which format is used in the most significant bit of the first
359 byte of the chunk. The document is bundled when this bit is set.
360 Otherwise the document is indirect.
361
362 {\bf Unencoded data} --- The #"DIRM"# chunk is composed of some unencoded
363 data followed by \Ref{bzz} encoded data. The unencoded data starts with
364 the version byte and a 16 bit integer representing the number of component
365 files. All integers are encoded with the most significant byte first.
366 \begin{verbatim}
367 BYTE: Flags/Version: 0b<bundled>0000001
368 INT16: Number of component files.
369 \end{verbatim}
370 When the document is a bundled document (i.e. the flag #bundled# is set),
371 this header is followed by the offsets of each of the component files within
372 the #"FORM:DJVM"#. These offsets allow for random component file access.
373 \begin{verbatim}
374 INT32: Offset of first component file.
375 INT32: Offset of second component file.
376 ...
377 INT32: Offset of last component file.
378 \end{verbatim}
379
380 {\bf BZZ encoded data} --- The rest of the chunk is entirely compressed
381 with the BZZ general purpose compressor. We describe now the data fed
382 into (or retrieved from) the BZZ codec (cf. \Ref{BSByteStream}.) First
383 come the sizes and the flags associated with each component file.
384 \begin{verbatim}
385 INT24: Size of the first component file.
386 INT24: Size of the second component file.
387 ...
388 INT24: Size of the last component file.
389 BYTE: Flag byte for the first component file.
390 BYTE: Flag byte for the second component file.
391 ...
392 BYTE: Flag byte for the last component file.
393 \end{verbatim}
394 The flag bytes have the following format:
395 \begin{verbatim}
396 0b<hasname><hastitle>000000 for a file included by other files.
397 0b<hasname><hastitle>000001 for a file representing a page.
398 0b<hasname><hastitle>000010 for a file containing thumbnails.
399 \end{verbatim}
400 Flag #hasname# is set when the name of the file is different from the file
401 ID. Flag #hastitle# is set when the title of the file is different from
402 the file ID. These flags are used to avoid encoding the same string three
403 times. Then come a sequence of zero terminated strings. There are one to
404 three such strings per component file. The first string contains the ID
405 of the component file. The second string contains the name of the
406 component file. It is only present when the flag #hasname# is set. The third
407 one contains the title of the component file. It is only present when the
408 flag #hastitle# is set. The \Ref{bzz} encoding system makes sure that
409 all these strings will be encoded efficiently despite their possible
410 redundancies.
411 \begin{verbatim}
412 ZSTR: ID of the first component file.
413 ZSTR: Name of the first component file (only if #hasname# is set.)
414 ZSTR: Title of the first component file (only if #hastitle# is set.)
415 ...
416 ZSTR: ID of the last component file.
417 ZSTR: Name of the last component file (only if #hasname# is set.)
418 ZSTR: Title of the last component file (only if #hastitle# is set.)
419 \end{verbatim}
420
421 @memo Description of the format of the DIRM chunk. */
422 //@}
423
424
425
426 // -------------- IMPLEMENTATION
427
428
429 inline bool
is_bundled(void)430 DjVmDir::is_bundled(void) const
431 {
432 return ! is_indirect();
433 }
434
435 inline bool
is_indirect(void)436 DjVmDir::is_indirect(void) const
437 {
438 GCriticalSectionLock lock((GCriticalSection *) &class_lock);
439 return ( files_list.size() && files_list[files_list] != 0 &&
440 files_list[files_list]->offset==0 );
441 }
442
443 inline GP<DjVmDir::File>
title_to_file(const GUTF8String & title)444 DjVmDir::title_to_file(const GUTF8String &title) const
445 {
446 GPosition pos;
447 return title_to_file(title, pos);
448 }
449
450
451
452 // ----- THE END
453
454 #ifdef HAVE_NAMESPACES
455 }
456 # ifndef NOT_USING_DJVU_NAMESPACE
457 using namespace DJVU;
458 # endif
459 #endif
460 #endif
461