// https://bitbucket.org/xwang/mdict-analysis
// https://github.com/zhansliu/writemdict/blob/master/fileformat.md
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
//
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
// Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
//
// This program is a free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3 of the License.
//
// You can get a copy of GNU General Public License along this program
// But you can always get it from http://www.gnu.org/licenses/gpl.txt
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
20 21 #ifndef __MDICTPARSER_HH_INCLUDED__ 22 #define __MDICTPARSER_HH_INCLUDED__ 23 24 #include <string> 25 #include <vector> 26 #include <map> 27 #include <utility> 28 29 #include <QPointer> 30 #include <QFile> 31 32 namespace Mdict 33 { 34 35 using std::string; 36 using std::vector; 37 using std::pair; 38 using std::map; 39 40 // A helper class to handle memory map for QFile 41 class ScopedMemMap 42 { 43 QFile & file; 44 uchar * address; 45 46 public: ScopedMemMap(QFile & file,qint64 offset,qint64 size)47 ScopedMemMap( QFile & file, qint64 offset, qint64 size ) : 48 file( file ), 49 address( file.map( offset, size ) ) 50 { 51 } 52 ~ScopedMemMap()53 ~ScopedMemMap() 54 { 55 if ( address ) 56 file.unmap( address ); 57 } 58 startAddress()59 inline uchar * startAddress() 60 { 61 return address; 62 } 63 }; 64 65 class MdictParser 66 { 67 public: 68 69 enum 70 { 71 kParserVersion = 0x000000d 72 }; 73 74 struct RecordIndex 75 { 76 qint64 startPos; 77 qint64 endPos; 78 qint64 shadowStartPos; 79 qint64 shadowEndPos; 80 qint64 compressedSize; 81 qint64 decompressedSize; 82 operator ==Mdict::MdictParser::RecordIndex83 inline bool operator==( qint64 rhs ) const 84 { 85 return ( shadowStartPos <= rhs ) && ( rhs < shadowEndPos ); 86 } 87 operator <Mdict::MdictParser::RecordIndex88 inline bool operator<( qint64 rhs ) const 89 { 90 return shadowEndPos <= rhs; 91 } 92 operator >Mdict::MdictParser::RecordIndex93 inline bool operator>( qint64 rhs ) const 94 { 95 return shadowStartPos > rhs; 96 } 97 98 static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val ); 99 }; 100 101 struct RecordInfo 102 { 103 qint64 compressedBlockPos; 104 qint64 recordOffset; 105 106 qint64 decompressedBlockSize; 107 qint64 compressedBlockSize; 108 qint64 recordSize; 109 }; 110 111 class RecordHandler 112 { 113 public: 114 virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0; 115 }; 116 117 typedef vector< pair<qint64, qint64> > BlockInfoVector; 118 typedef 
vector< pair<qint64, QString> > HeadWordIndex; 119 typedef map<qint32, pair<QString, QString> > StyleSheets; 120 title() const121 inline QString const & title() const 122 { 123 return title_; 124 } 125 description() const126 inline QString const & description() const 127 { 128 return description_; 129 } 130 styleSheets() const131 inline StyleSheets const & styleSheets() const 132 { 133 return styleSheets_; 134 } 135 wordCount() const136 inline quint32 wordCount() const 137 { 138 return wordCount_; 139 } 140 encoding() const141 inline QString const & encoding() const 142 { 143 return encoding_; 144 } 145 filename() const146 inline QString const & filename() const 147 { 148 return filename_; 149 } 150 isRightToLeft() const151 inline bool isRightToLeft() const 152 { 153 return rtl_; 154 } 155 156 MdictParser(); ~MdictParser()157 ~MdictParser() {} 158 159 bool open( const char * filename ); 160 bool readNextHeadWordIndex( HeadWordIndex & headWordIndex ); 161 bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler ); 162 163 // helpers 164 static QString toUtf16( const char * fromCode, const char * from, size_t fromSize ); toUtf16(QString const & fromCode,const char * from,size_t fromSize)165 static inline QString toUtf16( QString const & fromCode, const char * from, size_t fromSize ) 166 { 167 return toUtf16( fromCode.toLatin1().constData(), from, fromSize ); 168 } 169 static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr, 170 qint64 decompressedBlockSize, QByteArray & decompressedBlock); 171 static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets ); substituteStylesheet(string const & article,StyleSheets const & styleSheets)172 static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets ) 173 { 174 QString s = QString::fromUtf8( article.c_str() ); 175 substituteStylesheet( s, styleSheets ); 176 return string( 
s.toUtf8().constData() ); 177 } 178 179 protected: 180 qint64 readNumber( QDataStream & in ); 181 static quint32 readU8OrU16( QDataStream & in, bool isU16 ); 182 static bool checkAdler32(const char * buffer, unsigned int len, quint32 checksum); 183 static bool decryptHeadWordIndex(char * buffer, qint64 len); 184 bool readHeader( QDataStream & in ); 185 bool readHeadWordBlockInfos( QDataStream & in ); 186 bool readRecordBlockInfos(); 187 BlockInfoVector decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo ); 188 HeadWordIndex splitHeadWordBlock( QByteArray const & block ); 189 190 protected: 191 QString filename_; 192 QPointer<QFile> file_; 193 StyleSheets styleSheets_; 194 BlockInfoVector headWordBlockInfos_; 195 BlockInfoVector::iterator headWordBlockInfosIter_; 196 vector<RecordIndex> recordBlockInfos_; 197 198 QString encoding_; 199 QString title_; 200 QString description_; 201 202 double version_; 203 qint64 numHeadWordBlocks_; 204 qint64 headWordBlockInfoSize_; 205 qint64 headWordBlockSize_; 206 qint64 headWordBlockInfoPos_; 207 qint64 headWordPos_; 208 qint64 totalRecordsSize_; 209 qint64 recordPos_; 210 211 quint32 wordCount_; 212 int numberTypeSize_; 213 int encrypted_; 214 bool rtl_; 215 }; 216 217 } 218 219 #endif // __MDICTPARSER_HH_INCLUDED__ 220