1 // https://bitbucket.org/xwang/mdict-analysis
2 // https://github.com/zhansliu/writemdict/blob/master/fileformat.md
3 // Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
4 //
5 // Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
6 // Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
7 // Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
8 //
9 // This program is a free software; you can redistribute it and/or modify
10 // it under the terms of the GNU General Public License as published by
11 // the Free Software Foundation, version 3 of the License.
12 //
13 // You can get a copy of GNU General Public License along this program
14 // But you can always get it from http://www.gnu.org/licenses/gpl.txt
15 //
16 // This program is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 // GNU General Public License for more details.
20 
21 #ifndef __MDICTPARSER_HH_INCLUDED__
22 #define __MDICTPARSER_HH_INCLUDED__
23 
24 #include <string>
25 #include <vector>
26 #include <map>
27 #include <utility>
28 
29 #include <QPointer>
30 #include <QFile>
31 
32 namespace Mdict
33 {
34 
35 using std::string;
36 using std::vector;
37 using std::pair;
38 using std::map;
39 
40 // A helper class to handle memory map for QFile
41 class ScopedMemMap
42 {
43   QFile & file;
44   uchar * address;
45 
46 public:
ScopedMemMap(QFile & file,qint64 offset,qint64 size)47   ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
48     file( file ),
49     address( file.map( offset, size ) )
50   {
51   }
52 
~ScopedMemMap()53   ~ScopedMemMap()
54   {
55     if ( address )
56       file.unmap( address );
57   }
58 
startAddress()59   inline uchar * startAddress()
60   {
61     return address;
62   }
63 };
64 
65 class MdictParser
66 {
67 public:
68 
69   enum
70   {
71     kParserVersion = 0x000000d
72   };
73 
74   struct RecordIndex
75   {
76     qint64 startPos;
77     qint64 endPos;
78     qint64 shadowStartPos;
79     qint64 shadowEndPos;
80     qint64 compressedSize;
81     qint64 decompressedSize;
82 
operator ==Mdict::MdictParser::RecordIndex83     inline bool operator==( qint64 rhs ) const
84     {
85       return ( shadowStartPos <= rhs ) && ( rhs < shadowEndPos );
86     }
87 
operator <Mdict::MdictParser::RecordIndex88     inline bool operator<( qint64 rhs ) const
89     {
90       return shadowEndPos <= rhs;
91     }
92 
operator >Mdict::MdictParser::RecordIndex93     inline bool operator>( qint64 rhs ) const
94     {
95       return shadowStartPos > rhs;
96     }
97 
98     static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
99   };
100 
101   struct RecordInfo
102   {
103     qint64 compressedBlockPos;
104     qint64 recordOffset;
105 
106     qint64 decompressedBlockSize;
107     qint64 compressedBlockSize;
108     qint64 recordSize;
109   };
110 
111   class RecordHandler
112   {
113   public:
114     virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
115   };
116 
117   typedef vector< pair<qint64, qint64> > BlockInfoVector;
118   typedef vector< pair<qint64, QString> > HeadWordIndex;
119   typedef map<qint32, pair<QString, QString> > StyleSheets;
120 
title() const121   inline QString const & title() const
122   {
123     return title_;
124   }
125 
description() const126   inline QString const & description() const
127   {
128     return description_;
129   }
130 
styleSheets() const131   inline StyleSheets const & styleSheets() const
132   {
133     return styleSheets_;
134   }
135 
wordCount() const136   inline quint32 wordCount() const
137   {
138     return wordCount_;
139   }
140 
encoding() const141   inline QString const & encoding() const
142   {
143     return encoding_;
144   }
145 
filename() const146   inline QString const & filename() const
147   {
148     return filename_;
149   }
150 
isRightToLeft() const151   inline bool isRightToLeft() const
152   {
153     return rtl_;
154   }
155 
156   MdictParser();
~MdictParser()157   ~MdictParser() {}
158 
159   bool open( const char * filename );
160   bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
161   bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );
162 
163   // helpers
164   static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
toUtf16(QString const & fromCode,const char * from,size_t fromSize)165   static inline QString toUtf16( QString const & fromCode, const char * from, size_t fromSize )
166   {
167     return toUtf16( fromCode.toLatin1().constData(), from, fromSize );
168   }
169   static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr,
170                                     qint64 decompressedBlockSize, QByteArray & decompressedBlock);
171   static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
substituteStylesheet(string const & article,StyleSheets const & styleSheets)172   static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
173   {
174     QString s = QString::fromUtf8( article.c_str() );
175     substituteStylesheet( s, styleSheets );
176     return string( s.toUtf8().constData() );
177   }
178 
179 protected:
180   qint64 readNumber( QDataStream & in );
181   static quint32 readU8OrU16( QDataStream & in, bool isU16 );
182   static bool checkAdler32(const char * buffer, unsigned int len, quint32 checksum);
183   static bool decryptHeadWordIndex(char * buffer, qint64 len);
184   bool readHeader( QDataStream & in );
185   bool readHeadWordBlockInfos( QDataStream & in );
186   bool readRecordBlockInfos();
187   BlockInfoVector decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo );
188   HeadWordIndex splitHeadWordBlock( QByteArray const & block );
189 
190 protected:
191   QString filename_;
192   QPointer<QFile> file_;
193   StyleSheets styleSheets_;
194   BlockInfoVector headWordBlockInfos_;
195   BlockInfoVector::iterator headWordBlockInfosIter_;
196   vector<RecordIndex> recordBlockInfos_;
197 
198   QString encoding_;
199   QString title_;
200   QString description_;
201 
202   double version_;
203   qint64 numHeadWordBlocks_;
204   qint64 headWordBlockInfoSize_;
205   qint64 headWordBlockSize_;
206   qint64 headWordBlockInfoPos_;
207   qint64 headWordPos_;
208   qint64 totalRecordsSize_;
209   qint64 recordPos_;
210 
211   quint32 wordCount_;
212   int numberTypeSize_;
213   int encrypted_;
214   bool rtl_;
215 };
216 
217 }
218 
219 #endif // __MDICTPARSER_HH_INCLUDED__
220