1 //
2 // Copyright (C) 2020 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10
11 // details of how to handle the PNG file taken from OpenBabel's PNG handling
12 // code:
13 // https://github.com/openbabel/openbabel/blob/master/src/formats/pngformat.cpp
14
15 #include "PNGParser.h"
16 #include <GraphMol/MolPickler.h>
17 #include <RDGeneral/FileParseException.h>
18 #include <RDGeneral/StreamOps.h>
19 #include <vector>
20 #include <boost/crc.hpp>
21 #include <boost/algorithm/string.hpp>
22
23 #include "FileParsers.h"
24 #ifdef RDK_USE_BOOST_IOSTREAMS
25 #include <zlib.h>
26 #include <boost/iostreams/filtering_streambuf.hpp>
27 #include <boost/iostreams/copy.hpp>
28 #include <boost/iostreams/filter/zlib.hpp>
29 #endif
30
31 namespace RDKit {
32
// Keyword prefixes used for the molecule-related text chunks stored in PNG
// files: one each for SMILES, mol block and RDKit pickle payloads.
namespace PNGData {
const std::string smilesTag{"SMILES"};
const std::string molTag{"MOL"};
const std::string pklTag{"rdkitPKL"};
}  // namespace PNGData
38
39 namespace {
// The eight-byte signature that every PNG file starts with.
const std::vector<unsigned char> pngHeader = {137, 80, 78, 71, 13, 10, 26, 10};

// Consumes bytes from inStream and compares them against the PNG signature.
//
// Returns true only if every signature byte could be read and matched.
// Reading stops at the first mismatch or failed read, so on failure the
// stream position is wherever the comparison stopped.
bool checkPNGHeader(std::istream &inStream) {
  for (auto byte : pngHeader) {
    unsigned char ibyte = 0;
    inStream.read(reinterpret_cast<char *>(&ibyte), 1);
    // a failed read (empty or truncated stream) leaves ibyte meaningless, so
    // the stream state has to be checked before the value is compared
    if (inStream.fail() || ibyte != byte) {
      return false;
    }
  }
  return true;
}
51
52 #ifdef RDK_USE_BOOST_IOSTREAMS
uncompressString(const std::string & ztext)53 std::string uncompressString(const std::string &ztext) {
54 std::stringstream compressed(ztext);
55 std::stringstream uncompressed;
56 boost::iostreams::filtering_streambuf<boost::iostreams::input> bioOutstream;
57 bioOutstream.push(boost::iostreams::zlib_decompressor());
58 bioOutstream.push(compressed);
59 boost::iostreams::copy(bioOutstream, uncompressed);
60 return uncompressed.str();
61 }
compressString(const std::string & text)62 std::string compressString(const std::string &text) {
63 std::stringstream uncompressed(text);
64 std::stringstream compressed;
65 boost::iostreams::filtering_streambuf<boost::iostreams::input> bioOutstream;
66 bioOutstream.push(boost::iostreams::zlib_compressor());
67 bioOutstream.push(uncompressed);
68 boost::iostreams::copy(bioOutstream, compressed);
69 return compressed.str();
70 }
71
72 #endif
73 } // namespace
74
PNGStreamToMetadata(std::istream & inStream)75 std::vector<std::pair<std::string, std::string>> PNGStreamToMetadata(
76 std::istream &inStream) {
77 // confirm that it's a PNG file:
78 if (!checkPNGHeader(inStream)) {
79 throw FileParseException("PNG header not recognized");
80 }
81 std::vector<std::pair<std::string, std::string>> res;
82 // the file is organized in chunks. Read through them until we find the tEXt
83 // block FIX: at some point we'll want to also include zEXt here, but that
84 // requires zlib
85 while (inStream) {
86 std::uint32_t blockLen;
87 inStream.read((char *)&blockLen, sizeof(blockLen));
88 if (inStream.fail()) {
89 throw FileParseException("error when reading from PNG");
90 }
91 // PNG is big endian, make sure we handle the order correctly
92 blockLen = EndianSwapBytes<BIG_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(blockLen);
93 char bytes[4];
94 inStream.read(bytes, 4);
95 if (inStream.fail()) {
96 throw FileParseException("error when reading from PNG");
97 }
98 auto beginBlock = inStream.tellg();
99 if (bytes[0] == 'I' && bytes[1] == 'E' && bytes[2] == 'N' &&
100 bytes[3] == 'D') {
101 break;
102 }
103 #ifndef RDK_USE_BOOST_IOSTREAMS
104 bool alreadyWarned = false;
105 #endif
106 if (blockLen > 0 &&
107 ((bytes[0] == 't' && bytes[1] == 'E') ||
108 (bytes[0] == 'z' && bytes[1] == 'T')) &&
109 bytes[2] == 'X' && bytes[3] == 't') {
110 // in a tEXt block, read the key:
111 std::string key;
112 std::getline(inStream, key, '\0');
113 if (inStream.fail()) {
114 throw FileParseException("error when reading from PNG");
115 }
116 auto dataLen = blockLen - key.size() - 1;
117 std::string value;
118 if (bytes[0] == 't') {
119 value.resize(dataLen);
120 inStream.read(&value.front(), dataLen);
121 if (inStream.fail()) {
122 throw FileParseException("error when reading from PNG");
123 }
124 } else if (bytes[0] == 'z') {
125 #ifdef RDK_USE_BOOST_IOSTREAMS
126 value.resize(dataLen);
127 inStream.read(&value.front(), dataLen);
128 if (inStream.fail()) {
129 throw FileParseException("error when reading from PNG");
130 }
131 value = uncompressString(value.substr(1, dataLen - 1));
132 #else
133 value = "";
134 if (!alreadyWarned) {
135 BOOST_LOG(rdWarningLog)
136 << "compressed metadata found in PNG, but the RDKit was not "
137 "compiled with support for this. Skipping it."
138 << std::endl;
139 alreadyWarned = true;
140 }
141 #endif
142 } else {
143 CHECK_INVARIANT(0, "impossible value");
144 }
145 if (!value.empty()) {
146 res.push_back(std::make_pair(key, value));
147 }
148 }
149 inStream.seekg(beginBlock);
150 inStream.ignore(blockLen + 4); // the extra 4 bytes are the CRC
151 }
152
153 return res;
154 };
155
addMetadataToPNGStream(std::istream & inStream,const std::vector<std::pair<std::string,std::string>> & metadata,bool compressed)156 std::string addMetadataToPNGStream(
157 std::istream &inStream,
158 const std::vector<std::pair<std::string, std::string>> &metadata,
159 bool compressed) {
160 #ifndef RDK_USE_BOOST_IOSTREAMS
161 compressed = false;
162 #endif
163 // confirm that it's a PNG file:
164 if (!checkPNGHeader(inStream)) {
165 throw FileParseException("PNG header not recognized");
166 }
167 std::stringstream res;
168 // write the header
169 for (auto byte : pngHeader) {
170 res << byte;
171 }
172
173 // copy over everything up to IEND
174 bool foundEnd = false;
175 std::uint32_t finalCRC;
176 while (inStream) {
177 std::uint32_t blockLen;
178 inStream.read((char *)&blockLen, sizeof(blockLen));
179 char bytes[4];
180 inStream.read(bytes, 4);
181 if (bytes[0] == 'I' && bytes[1] == 'E' && bytes[2] == 'N' &&
182 bytes[3] == 'D') {
183 foundEnd = true;
184 inStream.read((char *)&finalCRC, sizeof(finalCRC));
185 break;
186 }
187 res.write((char *)&blockLen, sizeof(blockLen));
188 res.write(bytes, 4);
189 // PNG is big endian, make sure we handle the order correctly
190 blockLen = EndianSwapBytes<BIG_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(blockLen);
191 std::string block(blockLen + 4, 0);
192 inStream.read((char *)&block.front(),
193 blockLen + 4); // the extra 4 bytes are the CRC
194 res.write(block.c_str(), blockLen + 4);
195 }
196 if (!foundEnd) {
197 throw FileParseException("did not find IEND block in PNG");
198 }
199
200 // write out the metadata:
201 for (const auto &pr : metadata) {
202 std::stringstream blk;
203 if (!compressed) {
204 blk.write("tEXt", 4);
205 // write the name along with a zero
206 blk.write(pr.first.c_str(), pr.first.size() + 1);
207 blk.write(pr.second.c_str(), pr.second.size());
208 } else {
209 #ifdef RDK_USE_BOOST_IOSTREAMS
210 blk.write("zTXt", 4);
211 // write the name along with a zero
212 blk.write(pr.first.c_str(), pr.first.size() + 1);
213 // write the compressed data
214 // first a zero for the "compression method":
215 blk.write("\0", 1);
216 auto dest = compressString(pr.second);
217 blk.write((const char *)dest.c_str(), dest.size());
218 #else
219 // we shouldn't get here since we disabled compressed at the beginning of
220 // the function, but check to be sure
221 CHECK_INVARIANT(0, "compression support not enabled");
222 #endif
223 }
224 auto blob = blk.str();
225 std::uint32_t blksize =
226 blob.size() - 4; // we don't include the tag in the size;
227 boost::crc_32_type crc;
228 crc.process_bytes((void const *)blob.c_str(), blob.size());
229 std::uint32_t crcVal = crc.checksum();
230 // PNG is big endian, make sure we handle the order correctly
231 blksize = EndianSwapBytes<HOST_ENDIAN_ORDER, BIG_ENDIAN_ORDER>(blksize);
232
233 res.write((char *)&blksize, sizeof(blksize));
234 res.write(blob.c_str(), blob.size());
235 // PNG is big endian, make sure we handle the order correctly
236 crcVal = EndianSwapBytes<HOST_ENDIAN_ORDER, BIG_ENDIAN_ORDER>(crcVal);
237 res.write((char *)&crcVal, sizeof(crcVal));
238 }
239
240 // write out the IEND block
241 std::uint32_t blksize = 0;
242 res.write((char *)&blksize, sizeof(blksize));
243
244 const char *endTag = "IEND";
245 res.write(endTag, 4);
246 res.write((char *)&finalCRC, sizeof(finalCRC));
247 return res.str();
248 }
249
addMolToPNGStream(const ROMol & mol,std::istream & iStream,bool includePkl,bool includeSmiles,bool includeMol)250 std::string addMolToPNGStream(const ROMol &mol, std::istream &iStream,
251 bool includePkl, bool includeSmiles,
252 bool includeMol) {
253 std::vector<std::pair<std::string, std::string>> metadata;
254 if (includePkl) {
255 std::string pkl;
256 MolPickler::pickleMol(mol, pkl);
257 metadata.push_back(std::make_pair(augmentTagName(PNGData::pklTag), pkl));
258 }
259 if (includeSmiles) {
260 std::string smi = MolToCXSmiles(mol);
261 metadata.push_back(std::make_pair(augmentTagName(PNGData::smilesTag), smi));
262 }
263 if (includeMol) {
264 bool includeStereo = true;
265 int confId = -1;
266 bool kekulize = false;
267 std::string mb = MolToMolBlock(mol, includeStereo, confId, kekulize);
268 metadata.push_back(std::make_pair(augmentTagName(PNGData::molTag), mb));
269 }
270 return addMetadataToPNGStream(iStream, metadata);
271 };
272
PNGStreamToMol(std::istream & inStream,const SmilesParserParams & params)273 ROMol *PNGStreamToMol(std::istream &inStream,
274 const SmilesParserParams ¶ms) {
275 ROMol *res = nullptr;
276 auto metadata = PNGStreamToMetadata(inStream);
277 bool formatFound = false;
278 for (const auto &pr : metadata) {
279 if (boost::starts_with(pr.first, PNGData::pklTag)) {
280 res = new ROMol(pr.second);
281 formatFound = true;
282 } else if (boost::starts_with(pr.first, PNGData::smilesTag)) {
283 res = SmilesToMol(pr.second, params);
284 formatFound = true;
285 } else if (boost::starts_with(pr.first, PNGData::molTag)) {
286 res = MolBlockToMol(pr.second, params.sanitize, params.removeHs);
287 formatFound = true;
288 }
289 if (formatFound) {
290 break;
291 }
292 }
293 if (!formatFound) {
294 throw FileParseException("No suitable metadata found.");
295 }
296 return res;
297 }
298
PNGStreamToMols(std::istream & inStream,const std::string & tagToUse,const SmilesParserParams & params)299 std::vector<std::unique_ptr<ROMol>> PNGStreamToMols(
300 std::istream &inStream, const std::string &tagToUse,
301 const SmilesParserParams ¶ms) {
302 std::vector<std::unique_ptr<ROMol>> res;
303 auto metadata = PNGStreamToMetadata(inStream);
304 for (const auto &pr : metadata) {
305 if (!boost::starts_with(pr.first, tagToUse)) {
306 continue;
307 }
308 if (boost::starts_with(pr.first, PNGData::pklTag)) {
309 res.emplace_back(new ROMol(pr.second));
310 } else if (boost::starts_with(pr.first, PNGData::smilesTag)) {
311 res.emplace_back(SmilesToMol(pr.second, params));
312 } else if (boost::starts_with(pr.first, PNGData::molTag)) {
313 res.emplace_back(
314 MolBlockToMol(pr.second, params.sanitize, params.removeHs));
315 }
316 }
317 return res;
318 }
319
320 } // namespace RDKit
321