1 /*
2  * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   - Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  *
11  *   - Redistributions in binary form must reproduce the above copyright
12  *     notice, this list of conditions and the following disclaimer in the
13  *     documentation and/or other materials provided with the distribution.
14  *
15  *   - Neither the name of Oracle nor the names of its
16  *     contributors may be used to endorse or promote products derived
17  *     from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
20  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #ifndef LIBJIMAGE_IMAGEFILE_HPP
33 #define LIBJIMAGE_IMAGEFILE_HPP
34 
35 #include <assert.h>
36 
37 #include "endian.hpp"
38 #include "inttypes.hpp"
39 
40 // Image files are an alternate file format for storing classes and resources. The
41 // goal is to supply file access which is faster and smaller than the jar format.
42 // It should be noted that unlike jars, information stored in an image is in native
43 // endian format. This allows the image to be mapped into memory without endian
44 // translation.  This also means that images are platform dependent.
45 //
46 // Image files are structured as three sections;
47 //
48 //         +-----------+
49 //         |  Header   |
50 //         +-----------+
51 //         |           |
52 //         |   Index   |
53 //         |           |
54 //         +-----------+
55 //         |           |
56 //         |           |
57 //         | Resources |
58 //         |           |
59 //         |           |
60 //         +-----------+
61 //
62 // The header contains information related to identification and description of
63 // contents.
64 //
65 //         +-------------------------+
66 //         |   Magic (0xCAFEDADA)    |
67 //         +------------+------------+
68 //         | Major Vers | Minor Vers |
69 //         +------------+------------+
70 //         |          Flags          |
71 //         +-------------------------+
72 //         |      Resource Count     |
73 //         +-------------------------+
74 //         |       Table Length      |
75 //         +-------------------------+
76 //         |      Attributes Size    |
77 //         +-------------------------+
78 //         |       Strings Size      |
79 //         +-------------------------+
80 //
81 // Magic - means of identifying validity of the file.  This avoids requiring a
82 //         special file extension.
83 // Major vers, minor vers - differences in version numbers indicate structural
84 //                          changes in the image.
85 // Flags - various image wide flags (future).
86 // Resource count - number of resources in the file.
87 // Table length - the length of lookup tables used in the index.
88 // Attributes size - number of bytes in the region used to store location attribute
89 //                   streams.
90 // Strings size - the size of the region used to store strings used by the
91 //                index and meta data.
92 //
93 // The index contains information related to resource lookup. The algorithm
94 // used for lookup is "A Practical Minimal Perfect Hashing Method"
95 // (http://homepages.dcc.ufmg.br/~nivio/papers/wea05.pdf). Given a path string
96 // in the form /<module>/<package>/<base>.<extension>  return the resource location
97 // information;
98 //
99 //     redirectIndex = hash(path, DEFAULT_SEED) % table_length;
100 //     redirect = redirectTable[redirectIndex];
101 //     if (redirect == 0) return not found;
102 //     locationIndex = redirect < 0 ? -1 - redirect : hash(path, redirect) % table_length;
103 //     location = locationTable[locationIndex];
104 //     if (!verify(location, path)) return not found;
105 //     return location;
106 //
107 // Note: The hash function takes an initial seed value.  A different seed value
108 // usually returns a different result for strings that would otherwise collide with
109 // other seeds. The verify function guarantees the found resource location is
110 // indeed the resource we are looking for.
111 //
112 // The following is the format of the index;
113 //
114 //         +-------------------+
115 //         |   Redirect Table  |
116 //         +-------------------+
117 //         | Attribute Offsets |
118 //         +-------------------+
119 //         |   Attribute Data  |
120 //         +-------------------+
121 //         |      Strings      |
122 //         +-------------------+
123 //
124 // Redirect Table - Array of 32-bit signed values representing actions that
125 //                  should take place for hashed strings that map to that
126 //                  value.  Negative values indicate no hash collision and can be
127 //                  quickly converted to indices into attribute offsets.  Positive
128 //                  values represent a new seed for hashing an index into attribute
129 //                  offsets.  Zero indicates not found.
130 // Attribute Offsets - Array of 32-bit unsigned values representing offsets into
131 //                     attribute data.  Attribute offsets can be iterated to do a
132 //                     full survey of resources in the image.  Offset of zero
133 //                     indicates no attributes.
134 // Attribute Data - Bytes representing compact attribute data for locations. (See
135 //                  comments in ImageLocation.)
136 // Strings - Collection of zero terminated UTF-8 strings used by the index and
137 //           image meta data.  Each string is accessed by offset.  Each string is
138 //           unique.  Offset zero is reserved for the empty string.
139 //
140 // Note that the memory mapped index assumes 32 bit alignment of each component
141 // in the index.
142 //
143 // Endianness of an image.
144 // An image booted by hotspot is always in native endian.  However, it is possible
145 // to read (by the JDK) in alternate endian format.  Primarily, this is during
// cross platform scenarios.  For example, where javac needs to read an embedded
// image to access classes for cross compilation.
148 //
149 
150 class ImageFileReader; // forward declaration
151 
152 // Manage image file string table.
153 class ImageStrings {
154 private:
155     u1* _data; // Data bytes for strings.
156     u4 _size;  // Number of bytes in the string table.
157 public:
158     enum {
159         // Not found result from find routine.
160         NOT_FOUND = -1,
161         // Prime used to generate hash for Perfect Hashing.
162         HASH_MULTIPLIER = 0x01000193
163     };
164 
ImageStrings(u1 * data,u4 size)165     ImageStrings(u1* data, u4 size) : _data(data), _size(size) {}
166 
167     // Return the UTF-8 string beginning at offset.
get(u4 offset) const168     inline const char* get(u4 offset) const {
169         assert(offset < _size && "offset exceeds string table size");
170         return (const char*)(_data + offset);
171     }
172 
173     // Compute the Perfect Hashing hash code for the supplied UTF-8 string.
hash_code(const char * string)174     inline static u4 hash_code(const char* string) {
175         return hash_code(string, HASH_MULTIPLIER);
176     }
177 
178     // Compute the Perfect Hashing hash code for the supplied string, starting at seed.
179     static s4 hash_code(const char* string, s4 seed);
180 
181     // Match up a string in a perfect hash table.    Result still needs validation
182     // for precise match.
183     static s4 find(Endian* endian, const char* name, s4* redirect, u4 length);
184 
185     // Test to see if UTF-8 string begins with the start UTF-8 string.  If so,
186     // return non-NULL address of remaining portion of string.  Otherwise, return
187     // NULL.    Used to test sections of a path without copying from image string
188     // table.
189     static const char* starts_with(const char* string, const char* start);
190 
191     // Test to see if UTF-8 string begins with start char.  If so, return non-NULL
192     // address of remaining portion of string.  Otherwise, return NULL.  Used
193     // to test a character of a path without copying.
starts_with(const char * string,const char ch)194     inline static const char* starts_with(const char* string, const char ch) {
195         return *string == ch ? string + 1 : NULL;
196     }
197 };
198 
199 // Manage image file location attribute data.    Within an image, a location's
200 // attributes are compressed into a stream of bytes.    An attribute stream is
201 // composed of individual attribute sequences.  Each attribute sequence begins with
202 // a header byte containing the attribute 'kind' (upper 5 bits of header) and the
203 // 'length' less 1 (lower 3 bits of header) of bytes that follow containing the
204 // attribute value.  Attribute values present as most significant byte first.
205 //
// Ex. Container offset (ATTRIBUTE_OFFSET) 0x33562 would be represented as 0x2A
// (kind = 5, length = 3), 0x03, 0x35, 0x62.
208 //
209 // An attribute stream is terminated with a header kind of ATTRIBUTE_END (header
210 // byte of zero.)
211 //
212 // ImageLocation inflates the stream into individual values stored in the long
// array _attributes. This allows an attribute value to be quickly accessed by
214 // direct indexing. Unspecified values default to zero.
215 //
216 // Notes:
217 //  - Even though ATTRIBUTE_END is used to mark the end of the attribute stream,
218 //      streams will contain zero byte values to represent lesser significant bits.
219 //      Thus, detecting a zero byte is not sufficient to detect the end of an attribute
220 //      stream.
221 //  - ATTRIBUTE_OFFSET represents the number of bytes from the beginning of the region
222 //      storing the resources.  Thus, in an image this represents the number of bytes
223 //      after the index.
224 //  - Currently, compressed resources are represented by having a non-zero
225 //      ATTRIBUTE_COMPRESSED value.  This represents the number of bytes stored in the
226 //      image, and the value of ATTRIBUTE_UNCOMPRESSED represents number of bytes of the
227 //      inflated resource in memory. If the ATTRIBUTE_COMPRESSED is zero then the value
228 //      of ATTRIBUTE_UNCOMPRESSED represents both the number of bytes in the image and
229 //      in memory.  In the future, additional compression techniques will be used and
230 //      represented differently.
231 //  - Package strings include trailing slash and extensions include prefix period.
232 //
233 class ImageLocation {
234 public:
235     enum {
236         ATTRIBUTE_END,                  // End of attribute stream marker
237         ATTRIBUTE_MODULE,               // String table offset of module name
238         ATTRIBUTE_PARENT,               // String table offset of resource path parent
239         ATTRIBUTE_BASE,                 // String table offset of resource path base
240         ATTRIBUTE_EXTENSION,        // String table offset of resource path extension
241         ATTRIBUTE_OFFSET,               // Container byte offset of resource
242         ATTRIBUTE_COMPRESSED,       // In image byte size of the compressed resource
243         ATTRIBUTE_UNCOMPRESSED, // In memory byte size of the uncompressed resource
244         ATTRIBUTE_COUNT                 // Number of attribute kinds
245     };
246 
247 private:
248     // Values of inflated attributes.
249     u8 _attributes[ATTRIBUTE_COUNT];
250 
251     // Return the attribute value number of bytes.
attribute_length(u1 data)252     inline static u1 attribute_length(u1 data) {
253         return (data & 0x7) + 1;
254     }
255 
256     // Return the attribute kind.
attribute_kind(u1 data)257     inline static u1 attribute_kind(u1 data) {
258         u1 kind = data >> 3;
259         assert(kind < ATTRIBUTE_COUNT && "invalid attribute kind");
260         return kind;
261     }
262 
263     // Return the attribute length.
attribute_value(u1 * data,u1 n)264     inline static u8 attribute_value(u1* data, u1 n) {
265         assert(0 < n && n <= 8 && "invalid attribute value length");
266         u8 value = 0;
267         // Most significant bytes first.
268         for (u1 i = 0; i < n; i++) {
269             value <<= 8;
270             value |= data[i];
271         }
272         return value;
273     }
274 
275 public:
ImageLocation()276     ImageLocation() {
277         clear_data();
278     }
279 
ImageLocation(u1 * data)280     ImageLocation(u1* data) {
281         clear_data();
282         set_data(data);
283     }
284 
285     // Inflates the attribute stream into individual values stored in the long
286     // array _attributes. This allows an attribute value to be quickly accessed by
287     // direct indexing. Unspecified values default to zero.
288     void set_data(u1* data);
289 
290     // Zero all attribute values.
291     void clear_data();
292 
293     // Retrieve an attribute value from the inflated array.
get_attribute(u1 kind) const294     inline u8 get_attribute(u1 kind) const {
295         assert(ATTRIBUTE_END < kind && kind < ATTRIBUTE_COUNT && "invalid attribute kind");
296         return _attributes[kind];
297     }
298 
299     // Retrieve an attribute string value from the inflated array.
get_attribute(u4 kind,const ImageStrings & strings) const300     inline const char* get_attribute(u4 kind, const ImageStrings& strings) const {
301         return strings.get((u4)get_attribute(kind));
302     }
303 };
304 
305 //
306 // Manage the image module meta data.
307 class ImageModuleData {
308     const ImageFileReader* _image_file; // Source image file
309     Endian* _endian;                    // Endian handler
310 
311 public:
312     ImageModuleData(const ImageFileReader* image_file);
313     ~ImageModuleData();
314 
315     // Return the module in which a package resides.    Returns NULL if not found.
316     const char* package_to_module(const char* package_name);
317 };
318 
319 // Image file header, starting at offset 0.
320 class ImageHeader {
321 private:
322     u4 _magic;          // Image file marker
323     u4 _version;        // Image file major version number
324     u4 _flags;          // Image file flags
325     u4 _resource_count; // Number of resources in file
326     u4 _table_length;   // Number of slots in index tables
327     u4 _locations_size; // Number of bytes in attribute table
328     u4 _strings_size;   // Number of bytes in string table
329 
330 public:
magic() const331     u4 magic() const { return _magic; }
magic(Endian * endian) const332     u4 magic(Endian* endian) const { return endian->get(_magic); }
set_magic(Endian * endian,u4 magic)333     void set_magic(Endian* endian, u4 magic) { return endian->set(_magic, magic); }
334 
major_version(Endian * endian) const335     u4 major_version(Endian* endian) const { return endian->get(_version) >> 16; }
minor_version(Endian * endian) const336     u4 minor_version(Endian* endian) const { return endian->get(_version) & 0xFFFF; }
set_version(Endian * endian,u4 major_version,u4 minor_version)337     void set_version(Endian* endian, u4 major_version, u4 minor_version) {
338         return endian->set(_version, major_version << 16 | minor_version);
339     }
340 
flags(Endian * endian) const341     u4 flags(Endian* endian) const { return endian->get(_flags); }
set_flags(Endian * endian,u4 value)342     void set_flags(Endian* endian, u4 value) { return endian->set(_flags, value); }
343 
resource_count(Endian * endian) const344     u4 resource_count(Endian* endian) const { return endian->get(_resource_count); }
set_resource_count(Endian * endian,u4 count)345     void set_resource_count(Endian* endian, u4 count) { return endian->set(_resource_count, count); }
346 
table_length(Endian * endian) const347     u4 table_length(Endian* endian) const { return endian->get(_table_length); }
set_table_length(Endian * endian,u4 count)348     void set_table_length(Endian* endian, u4 count) { return endian->set(_table_length, count); }
349 
locations_size(Endian * endian) const350     u4 locations_size(Endian* endian) const { return endian->get(_locations_size); }
set_locations_size(Endian * endian,u4 size)351     void set_locations_size(Endian* endian, u4 size) { return endian->set(_locations_size, size); }
352 
strings_size(Endian * endian) const353     u4 strings_size(Endian* endian) const { return endian->get(_strings_size); }
set_strings_size(Endian * endian,u4 size)354     void set_strings_size(Endian* endian, u4 size) { return endian->set(_strings_size, size); }
355 };
356 
357 // Max path length limit independent of platform.    Windows max path is 1024,
358 // other platforms use 4096.    The JCK fails several tests when 1024 is used.
359 #define IMAGE_MAX_PATH 4096
360 
361 class ImageFileReader;
362 
363 // Manage a table of open image files.  This table allows multiple access points
364 // to share an open image.
365 class ImageFileReaderTable {
366 private:
367     const static u4 _growth = 8; // Growth rate of the table
368     u4 _count;                   // Number of entries in the table
369     u4 _max;                     // Maximum number of entries allocated
370     ImageFileReader** _table;    // Growable array of entries
371 
372 public:
373     ImageFileReaderTable();
374     ~ImageFileReaderTable();
375 
376     // Return the number of entries.
count()377     inline u4 count() { return _count; }
378 
379     // Return the ith entry from the table.
get(u4 i)380     inline ImageFileReader* get(u4 i) { return _table[i]; }
381 
382     // Add a new image entry to the table.
383     void add(ImageFileReader* image);
384 
385     // Remove an image entry from the table.
386     void remove(ImageFileReader* image);
387 
388     // Determine if image entry is in table.
389     bool contains(ImageFileReader* image);
390 };
391 
392 // Manage the image file.
393 // ImageFileReader manages the content of an image file.
394 // Initially, the header of the image file is read for validation.  If valid,
// values in the header are used to calculate the size of the image index.  The
396 // index is then memory mapped to allow load on demand and sharing.  The
397 // -XX:+MemoryMapImage flag determines if the entire file is loaded (server use.)
398 // An image can be used by Hotspot and multiple reference points in the JDK, thus
// it is desirable to share a reader.    To accommodate sharing, a share table is
// defined (see ImageFileReaderTable in imageFile.cpp).  To track the number of
401 // uses, ImageFileReader keeps a use count (_use).  Use is incremented when
402 // 'opened' by reference point and decremented when 'closed'.    Use of zero
403 // leads the ImageFileReader to be actually closed and discarded.
404 class ImageFileReader {
405 friend class ImageFileReaderTable;
406 private:
407     // Manage a number of image files such that an image can be shared across
408     // multiple uses (ex. loader.)
409     static ImageFileReaderTable _reader_table;
410 
411     // true if image should be fully memory mapped.
412     static bool memory_map_image;
413 
414     char* _name;         // Name of image
415     s4 _use;             // Use count
416     int _fd;             // File descriptor
417     Endian* _endian;     // Endian handler
418     u8 _file_size;       // File size in bytes
419     ImageHeader _header; // Image header
420     size_t _index_size;  // Total size of index
421     u1* _index_data;     // Raw index data
422     s4* _redirect_table; // Perfect hash redirect table
423     u4* _offsets_table;  // Location offset table
424     u1* _location_bytes; // Location attributes
425     u1* _string_bytes;   // String table
426     ImageModuleData *module_data;       // The ImageModuleData for this image
427 
428     ImageFileReader(const char* name, bool big_endian);
429     ~ImageFileReader();
430 
431     // Compute number of bytes in image file index.
index_size()432     inline size_t index_size() {
433         return sizeof(ImageHeader) +
434             table_length() * sizeof(u4) * 2 + locations_size() + strings_size();
435     }
436 
437 public:
438     enum {
439         // Image file marker.
440         IMAGE_MAGIC = 0xCAFEDADA,
441         // Endian inverted Image file marker.
442         IMAGE_MAGIC_INVERT = 0xDADAFECA,
443         // Image file major version number.
444         MAJOR_VERSION = 1,
445         // Image file minor version number.
446         MINOR_VERSION = 0
447     };
448 
449     // Locate an image if file already open.
450     static ImageFileReader* find_image(const char* name);
451 
452     // Open an image file, reuse structure if file already open.
453     static ImageFileReader* open(const char* name, bool big_endian = Endian::is_big_endian());
454 
455     // Close an image file if the file is not in use elsewhere.
456     static void close(ImageFileReader *reader);
457 
458     // Return an id for the specifed ImageFileReader.
459     static u8 reader_to_ID(ImageFileReader *reader);
460 
461     // Validate the image id.
462     static bool id_check(u8 id);
463 
464     // Return an id for the specifed ImageFileReader.
465     static ImageFileReader* id_to_reader(u8 id);
466 
467     // Open image file for read access.
468     bool open();
469 
470     // Close image file.
471     void close();
472 
473     // Read directly from the file.
474     bool read_at(u1* data, u8 size, u8 offset) const;
475 
endian() const476     inline Endian* endian() const { return _endian; }
477 
478     // Retrieve name of image file.
name() const479     inline const char* name() const {
480         return _name;
481     }
482 
483     // Retrieve size of image file.
file_size() const484     inline u8 file_size() const {
485         return _file_size;
486     }
487 
488     // Retrieve the size of the mapped image.
map_size() const489     inline u8 map_size() const {
490         return (u8)(memory_map_image ? _file_size : _index_size);
491     }
492 
493     // Return first address of index data.
get_index_address() const494     inline u1* get_index_address() const {
495         return _index_data;
496     }
497 
498     // Return first address of resource data.
get_data_address() const499     inline u1* get_data_address() const {
500         return _index_data + _index_size;
501     }
502 
503     // Get the size of the index data.
get_index_size() const504     size_t get_index_size() const {
505         return _index_size;
506     }
507 
table_length() const508     inline u4 table_length() const {
509         return _header.table_length(_endian);
510     }
511 
locations_size() const512     inline u4 locations_size() const {
513         return _header.locations_size(_endian);
514     }
515 
strings_size() const516     inline u4 strings_size()const    {
517         return _header.strings_size(_endian);
518     }
519 
offsets_table() const520     inline u4* offsets_table() const {
521         return _offsets_table;
522     }
523 
524     // Increment use count.
inc_use()525     inline void inc_use() {
526         _use++;
527     }
528 
529     // Decrement use count.
dec_use()530     inline bool dec_use() {
531         return --_use == 0;
532     }
533 
534     // Return a string table accessor.
get_strings() const535     inline const ImageStrings get_strings() const {
536         return ImageStrings(_string_bytes, _header.strings_size(_endian));
537     }
538 
539     // Return location attribute stream at offset.
get_location_offset_data(u4 offset) const540     inline u1* get_location_offset_data(u4 offset) const {
541         assert((u4)offset < _header.locations_size(_endian) &&
542                             "offset exceeds location attributes size");
543         return offset != 0 ? _location_bytes + offset : NULL;
544     }
545 
546     // Return location attribute stream for location i.
get_location_data(u4 index) const547     inline u1* get_location_data(u4 index) const {
548         return get_location_offset_data(get_location_offset(index));
549     }
550 
551     // Return the location offset for index.
get_location_offset(u4 index) const552     inline u4 get_location_offset(u4 index) const {
553         assert((u4)index < _header.table_length(_endian) &&
554                             "index exceeds location count");
555         return _endian->get(_offsets_table[index]);
556     }
557 
558     // Find the location attributes associated with the path.    Returns true if
559     // the location is found, false otherwise.
560     bool find_location(const char* path, ImageLocation& location) const;
561 
562     // Find the location index and size associated with the path.
563     // Returns the location index and size if the location is found,
564     // ImageFileReader::NOT_FOUND otherwise.
565     u4 find_location_index(const char* path, u8 *size) const;
566 
567     // Verify that a found location matches the supplied path.
568     bool verify_location(ImageLocation& location, const char* path) const;
569 
570     // Return the resource for the supplied location index.
571     void get_resource(u4 index, u1* uncompressed_data) const;
572 
573     // Return the resource for the supplied path.
574     void get_resource(ImageLocation& location, u1* uncompressed_data) const;
575 
576     // Return the ImageModuleData for this image
577     ImageModuleData * get_image_module_data();
578 
579 };
580 #endif // LIBJIMAGE_IMAGEFILE_HPP
581