1 //
2 //  Copyright (C) 2020 Greg Landrum
3 //
4 //   @@ All Rights Reserved @@
5 //  This file is part of the RDKit.
6 //  The contents are covered by the terms of the BSD license
7 //  which is included in the file license.txt, found at the root
8 //  of the RDKit source tree.
9 //
10 
11 // details of how to handle the PNG file taken from OpenBabel's PNG handling
12 // code:
13 // https://github.com/openbabel/openbabel/blob/master/src/formats/pngformat.cpp
14 
15 #include "PNGParser.h"
16 #include <GraphMol/MolPickler.h>
17 #include <RDGeneral/FileParseException.h>
18 #include <RDGeneral/StreamOps.h>
19 #include <vector>
20 #include <boost/crc.hpp>
21 #include <boost/algorithm/string.hpp>
22 
23 #include "FileParsers.h"
24 #ifdef RDK_USE_BOOST_IOSTREAMS
25 #include <zlib.h>
26 #include <boost/iostreams/filtering_streambuf.hpp>
27 #include <boost/iostreams/copy.hpp>
28 #include <boost/iostreams/filter/zlib.hpp>
29 #endif
30 
31 namespace RDKit {
32 
// Keys (prefixes) used for the molecule-related text chunks stored in PNGs.
namespace PNGData {
// chunk holding a SMILES string
const std::string smilesTag{"SMILES"};
// chunk holding a mol block
const std::string molTag{"MOL"};
// chunk holding an RDKit pickle
const std::string pklTag{"rdkitPKL"};
}  // namespace PNGData
38 
39 namespace {
// The 8 byte signature that starts every PNG file.
const std::vector<unsigned char> pngHeader = {137, 80, 78, 71, 13, 10, 26, 10};

// Consumes bytes from inStream and compares them with the PNG signature.
//
// Returns true if all signature bytes were read and matched. Returns false on
// the first byte that differs or as soon as the stream cannot deliver a byte
// (empty or truncated input); reading stops at that point.
bool checkPNGHeader(std::istream &inStream) {
  for (const auto expected : pngHeader) {
    unsigned char ibyte = 0;
    inStream.read(reinterpret_cast<char *>(&ibyte), 1);
    // a failed read leaves ibyte untouched, so the stream state has to be
    // checked before the value can be trusted
    if (inStream.fail() || ibyte != expected) {
      return false;
    }
  }
  return true;
}
51 
52 #ifdef RDK_USE_BOOST_IOSTREAMS
uncompressString(const std::string & ztext)53 std::string uncompressString(const std::string &ztext) {
54   std::stringstream compressed(ztext);
55   std::stringstream uncompressed;
56   boost::iostreams::filtering_streambuf<boost::iostreams::input> bioOutstream;
57   bioOutstream.push(boost::iostreams::zlib_decompressor());
58   bioOutstream.push(compressed);
59   boost::iostreams::copy(bioOutstream, uncompressed);
60   return uncompressed.str();
61 }
compressString(const std::string & text)62 std::string compressString(const std::string &text) {
63   std::stringstream uncompressed(text);
64   std::stringstream compressed;
65   boost::iostreams::filtering_streambuf<boost::iostreams::input> bioOutstream;
66   bioOutstream.push(boost::iostreams::zlib_compressor());
67   bioOutstream.push(uncompressed);
68   boost::iostreams::copy(bioOutstream, compressed);
69   return compressed.str();
70 }
71 
72 #endif
73 }  // namespace
74 
PNGStreamToMetadata(std::istream & inStream)75 std::vector<std::pair<std::string, std::string>> PNGStreamToMetadata(
76     std::istream &inStream) {
77   // confirm that it's a PNG file:
78   if (!checkPNGHeader(inStream)) {
79     throw FileParseException("PNG header not recognized");
80   }
81   std::vector<std::pair<std::string, std::string>> res;
82   // the file is organized in chunks. Read through them until we find the tEXt
83   // block FIX: at some point we'll want to also include zEXt here, but that
84   // requires zlib
85   while (inStream) {
86     std::uint32_t blockLen;
87     inStream.read((char *)&blockLen, sizeof(blockLen));
88     if (inStream.fail()) {
89       throw FileParseException("error when reading from PNG");
90     }
91     // PNG is big endian, make sure we handle the order correctly
92     blockLen = EndianSwapBytes<BIG_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(blockLen);
93     char bytes[4];
94     inStream.read(bytes, 4);
95     if (inStream.fail()) {
96       throw FileParseException("error when reading from PNG");
97     }
98     auto beginBlock = inStream.tellg();
99     if (bytes[0] == 'I' && bytes[1] == 'E' && bytes[2] == 'N' &&
100         bytes[3] == 'D') {
101       break;
102     }
103 #ifndef RDK_USE_BOOST_IOSTREAMS
104     bool alreadyWarned = false;
105 #endif
106     if (blockLen > 0 &&
107         ((bytes[0] == 't' && bytes[1] == 'E') ||
108          (bytes[0] == 'z' && bytes[1] == 'T')) &&
109         bytes[2] == 'X' && bytes[3] == 't') {
110       // in a tEXt block, read the key:
111       std::string key;
112       std::getline(inStream, key, '\0');
113       if (inStream.fail()) {
114         throw FileParseException("error when reading from PNG");
115       }
116       auto dataLen = blockLen - key.size() - 1;
117       std::string value;
118       if (bytes[0] == 't') {
119         value.resize(dataLen);
120         inStream.read(&value.front(), dataLen);
121         if (inStream.fail()) {
122           throw FileParseException("error when reading from PNG");
123         }
124       } else if (bytes[0] == 'z') {
125 #ifdef RDK_USE_BOOST_IOSTREAMS
126         value.resize(dataLen);
127         inStream.read(&value.front(), dataLen);
128         if (inStream.fail()) {
129           throw FileParseException("error when reading from PNG");
130         }
131         value = uncompressString(value.substr(1, dataLen - 1));
132 #else
133         value = "";
134         if (!alreadyWarned) {
135           BOOST_LOG(rdWarningLog)
136               << "compressed metadata found in PNG, but the RDKit was not "
137                  "compiled with support for this. Skipping it."
138               << std::endl;
139           alreadyWarned = true;
140         }
141 #endif
142       } else {
143         CHECK_INVARIANT(0, "impossible value");
144       }
145       if (!value.empty()) {
146         res.push_back(std::make_pair(key, value));
147       }
148     }
149     inStream.seekg(beginBlock);
150     inStream.ignore(blockLen + 4);  // the extra 4 bytes are the CRC
151   }
152 
153   return res;
154 };
155 
addMetadataToPNGStream(std::istream & inStream,const std::vector<std::pair<std::string,std::string>> & metadata,bool compressed)156 std::string addMetadataToPNGStream(
157     std::istream &inStream,
158     const std::vector<std::pair<std::string, std::string>> &metadata,
159     bool compressed) {
160 #ifndef RDK_USE_BOOST_IOSTREAMS
161   compressed = false;
162 #endif
163   // confirm that it's a PNG file:
164   if (!checkPNGHeader(inStream)) {
165     throw FileParseException("PNG header not recognized");
166   }
167   std::stringstream res;
168   // write the header
169   for (auto byte : pngHeader) {
170     res << byte;
171   }
172 
173   // copy over everything up to IEND
174   bool foundEnd = false;
175   std::uint32_t finalCRC;
176   while (inStream) {
177     std::uint32_t blockLen;
178     inStream.read((char *)&blockLen, sizeof(blockLen));
179     char bytes[4];
180     inStream.read(bytes, 4);
181     if (bytes[0] == 'I' && bytes[1] == 'E' && bytes[2] == 'N' &&
182         bytes[3] == 'D') {
183       foundEnd = true;
184       inStream.read((char *)&finalCRC, sizeof(finalCRC));
185       break;
186     }
187     res.write((char *)&blockLen, sizeof(blockLen));
188     res.write(bytes, 4);
189     // PNG is big endian, make sure we handle the order correctly
190     blockLen = EndianSwapBytes<BIG_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(blockLen);
191     std::string block(blockLen + 4, 0);
192     inStream.read((char *)&block.front(),
193                   blockLen + 4);  // the extra 4 bytes are the CRC
194     res.write(block.c_str(), blockLen + 4);
195   }
196   if (!foundEnd) {
197     throw FileParseException("did not find IEND block in PNG");
198   }
199 
200   // write out the metadata:
201   for (const auto &pr : metadata) {
202     std::stringstream blk;
203     if (!compressed) {
204       blk.write("tEXt", 4);
205       // write the name along with a zero
206       blk.write(pr.first.c_str(), pr.first.size() + 1);
207       blk.write(pr.second.c_str(), pr.second.size());
208     } else {
209 #ifdef RDK_USE_BOOST_IOSTREAMS
210       blk.write("zTXt", 4);
211       // write the name along with a zero
212       blk.write(pr.first.c_str(), pr.first.size() + 1);
213       // write the compressed data
214       // first a zero for the "compression method":
215       blk.write("\0", 1);
216       auto dest = compressString(pr.second);
217       blk.write((const char *)dest.c_str(), dest.size());
218 #else
219       // we shouldn't get here since we disabled compressed at the beginning of
220       // the function, but check to be sure
221       CHECK_INVARIANT(0, "compression support not enabled");
222 #endif
223     }
224     auto blob = blk.str();
225     std::uint32_t blksize =
226         blob.size() - 4;  // we don't include the tag in the size;
227     boost::crc_32_type crc;
228     crc.process_bytes((void const *)blob.c_str(), blob.size());
229     std::uint32_t crcVal = crc.checksum();
230     // PNG is big endian, make sure we handle the order correctly
231     blksize = EndianSwapBytes<HOST_ENDIAN_ORDER, BIG_ENDIAN_ORDER>(blksize);
232 
233     res.write((char *)&blksize, sizeof(blksize));
234     res.write(blob.c_str(), blob.size());
235     // PNG is big endian, make sure we handle the order correctly
236     crcVal = EndianSwapBytes<HOST_ENDIAN_ORDER, BIG_ENDIAN_ORDER>(crcVal);
237     res.write((char *)&crcVal, sizeof(crcVal));
238   }
239 
240   // write out the IEND block
241   std::uint32_t blksize = 0;
242   res.write((char *)&blksize, sizeof(blksize));
243 
244   const char *endTag = "IEND";
245   res.write(endTag, 4);
246   res.write((char *)&finalCRC, sizeof(finalCRC));
247   return res.str();
248 }
249 
addMolToPNGStream(const ROMol & mol,std::istream & iStream,bool includePkl,bool includeSmiles,bool includeMol)250 std::string addMolToPNGStream(const ROMol &mol, std::istream &iStream,
251                               bool includePkl, bool includeSmiles,
252                               bool includeMol) {
253   std::vector<std::pair<std::string, std::string>> metadata;
254   if (includePkl) {
255     std::string pkl;
256     MolPickler::pickleMol(mol, pkl);
257     metadata.push_back(std::make_pair(augmentTagName(PNGData::pklTag), pkl));
258   }
259   if (includeSmiles) {
260     std::string smi = MolToCXSmiles(mol);
261     metadata.push_back(std::make_pair(augmentTagName(PNGData::smilesTag), smi));
262   }
263   if (includeMol) {
264     bool includeStereo = true;
265     int confId = -1;
266     bool kekulize = false;
267     std::string mb = MolToMolBlock(mol, includeStereo, confId, kekulize);
268     metadata.push_back(std::make_pair(augmentTagName(PNGData::molTag), mb));
269   }
270   return addMetadataToPNGStream(iStream, metadata);
271 };
272 
PNGStreamToMol(std::istream & inStream,const SmilesParserParams & params)273 ROMol *PNGStreamToMol(std::istream &inStream,
274                       const SmilesParserParams &params) {
275   ROMol *res = nullptr;
276   auto metadata = PNGStreamToMetadata(inStream);
277   bool formatFound = false;
278   for (const auto &pr : metadata) {
279     if (boost::starts_with(pr.first, PNGData::pklTag)) {
280       res = new ROMol(pr.second);
281       formatFound = true;
282     } else if (boost::starts_with(pr.first, PNGData::smilesTag)) {
283       res = SmilesToMol(pr.second, params);
284       formatFound = true;
285     } else if (boost::starts_with(pr.first, PNGData::molTag)) {
286       res = MolBlockToMol(pr.second, params.sanitize, params.removeHs);
287       formatFound = true;
288     }
289     if (formatFound) {
290       break;
291     }
292   }
293   if (!formatFound) {
294     throw FileParseException("No suitable metadata found.");
295   }
296   return res;
297 }
298 
PNGStreamToMols(std::istream & inStream,const std::string & tagToUse,const SmilesParserParams & params)299 std::vector<std::unique_ptr<ROMol>> PNGStreamToMols(
300     std::istream &inStream, const std::string &tagToUse,
301     const SmilesParserParams &params) {
302   std::vector<std::unique_ptr<ROMol>> res;
303   auto metadata = PNGStreamToMetadata(inStream);
304   for (const auto &pr : metadata) {
305     if (!boost::starts_with(pr.first, tagToUse)) {
306       continue;
307     }
308     if (boost::starts_with(pr.first, PNGData::pklTag)) {
309       res.emplace_back(new ROMol(pr.second));
310     } else if (boost::starts_with(pr.first, PNGData::smilesTag)) {
311       res.emplace_back(SmilesToMol(pr.second, params));
312     } else if (boost::starts_with(pr.first, PNGData::molTag)) {
313       res.emplace_back(
314           MolBlockToMol(pr.second, params.sanitize, params.removeHs));
315     }
316   }
317   return res;
318 }
319 
320 }  // namespace RDKit
321