1 /*
2     This file is part of Kiten, a KDE Japanese Reference Tool
3     SPDX-FileCopyrightText: 2001 Jason Katz-Brown <jason@katzbrown.com>
4     SPDX-FileCopyrightText: 2006 Joseph Kerian <jkerian@gmail.com>
5     SPDX-FileCopyrightText: 2006 Eric Kjeldergaard <kjelderg@gmail.com>
6     SPDX-FileCopyrightText: 2011 Daniel E. Moctezuma <democtezuma@gmail.com>
7 
8     SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10 
11 #include "entrykanjidic.h"
12 
13 #include "dictfilekanjidic.h"
14 #include "kitenmacros.h"
15 
16 #include <KLocalizedString>
17 #include <QDebug>
18 
/* Yields the QStringList that x points to, or an empty list when x is a null pointer. */
#define QSTRINGLISTCHECK(x) ( (x) == nullptr ? QStringList() : *(x) )
20 
/**
 * Copy constructor: hands @p dict to the Entry copy constructor and does
 * nothing else.
 */
EntryKanjidic::EntryKanjidic( const EntryKanjidic &dict )
  : Entry( dict )
{
}
25 
EntryKanjidic(const QString & dict)26 EntryKanjidic::EntryKanjidic( const QString &dict )
27 : Entry( dict )
28 {
29 }
30 
EntryKanjidic(const QString & dict,const QString & entry)31 EntryKanjidic::EntryKanjidic( const QString &dict, const QString &entry )
32 : Entry( dict )
33 {
34   loadEntry( entry );
35 }
36 
addReadings(const QStringList & list) const37 QString EntryKanjidic::addReadings( const QStringList &list ) const
38 {
39   QString readings;
40   foreach( const QString &reading, list )
41   {
42     readings.append( makeReadingLink( reading ) + outputListDelimiter );
43   }
44 
45   return readings;
46 }
47 
clone() const48 Entry* EntryKanjidic::clone() const
49 {
50   return new EntryKanjidic( *this );
51 }
52 
53 /**
54  * This reproduces a kanjidic-formatted line from the Entry.
55  *  Look at the above parser to see how the format works.
56  */
dumpEntry() const57 QString EntryKanjidic::dumpEntry() const
58 {
59   /* Loop over the ExtendedInfo to add it to the line we produce */
60   QString dumpExtendedInfo;
61   QHash<QString,QString>::const_iterator it;
62   for( it = ExtendedInfo.constBegin(); it != ExtendedInfo.constEnd(); ++it )
63   {
64     dumpExtendedInfo += ' ' + it.key() + it.value();
65   }
66 
67   return QStringLiteral( "%1 %2%3" ).arg( Word )
68                              .arg( Readings.join( QLatin1Char(' ') ) )
69                              .arg( dumpExtendedInfo );
70 }
71 
extendedItemCheck(const QString & key,const QString & value) const72 bool EntryKanjidic::extendedItemCheck( const QString &key, const QString &value ) const
73 {
74   if( key == QLatin1String("common") )
75   {
76     return ! getExtendedInfoItem( QStringLiteral("G") ).isEmpty();
77   }
78 
79   return Entry::extendedItemCheck( key, value );
80 }
81 
getAsRadicalReadings() const82 QString EntryKanjidic::getAsRadicalReadings() const
83 {
84   return AsRadicalReadings.join( outputListDelimiter );
85 }
86 
getAsRadicalReadingsList() const87 QStringList EntryKanjidic::getAsRadicalReadingsList() const
88 {
89   return AsRadicalReadings;
90 }
91 
getDictionaryType() const92 QString EntryKanjidic::getDictionaryType() const
93 {
94   return KANJIDIC;
95 }
96 
getInNamesReadings() const97 QString EntryKanjidic::getInNamesReadings() const
98 {
99   return InNamesReadings.join( outputListDelimiter );
100 }
101 
getInNamesReadingsList() const102 QStringList EntryKanjidic::getInNamesReadingsList() const
103 {
104   return InNamesReadings;
105 }
106 
getKanjiGrade() const107 QString EntryKanjidic::getKanjiGrade() const
108 {
109   return getExtendedInfoItem( QStringLiteral("G") );
110 }
111 
getKunyomiReadings() const112 QString EntryKanjidic::getKunyomiReadings() const
113 {
114   return KunyomiReadings.join( outputListDelimiter );
115 }
116 
getKunyomiReadingsList() const117 QStringList EntryKanjidic::getKunyomiReadingsList() const
118 {
119   return KunyomiReadings;
120 }
121 
getOnyomiReadings() const122 QString EntryKanjidic::getOnyomiReadings() const
123 {
124   return OnyomiReadings.join( outputListDelimiter );
125 }
126 
getOnyomiReadingsList() const127 QStringList EntryKanjidic::getOnyomiReadingsList() const
128 {
129   return OnyomiReadings;
130 }
131 
getStrokesCount() const132 QString EntryKanjidic::getStrokesCount() const
133 {
134   return getExtendedInfoItem( QStringLiteral("S") );
135 }
136 
/**
 * Renders one ExtendedInfo item as "<field>: <value>" inside a span of
 * class "ExtendedInfo". A field that is not present is rendered with a
 * default-constructed (empty) value.
 */
QString EntryKanjidic::HTMLExtendedInfo( const QString &field ) const
{
  const QString value = ExtendedInfo.value( field );
  const QString html = QStringLiteral( "<span class=\"ExtendedInfo\">%1: %2</span>" )
                           .arg( field )
                           .arg( value );
  return html;
}
144 
145 /**
146  * Prepares Readings for output as HTML
147  */
HTMLReadings() const148 QString EntryKanjidic::HTMLReadings() const
149 {
150   QString htmlReadings;
151   htmlReadings += addReadings( originalReadings );
152 
153   if( InNamesReadings.count() > 0 )
154   {
155     htmlReadings += i18n( "In names: " );
156     htmlReadings += addReadings( InNamesReadings );
157   }
158 
159   if( AsRadicalReadings.count() > 0 )
160   {
161     htmlReadings += i18n( "As radical: " );
162     htmlReadings += addReadings( AsRadicalReadings );
163   }
164 
165   // get rid of last ,
166   htmlReadings.truncate( htmlReadings.length() - outputListDelimiter.length() );
167   return QStringLiteral( "<span class=\"Readings\">%1</span>" ).arg( htmlReadings );
168 }
169 
HTMLWord() const170 QString EntryKanjidic::HTMLWord() const
171 {
172   return QStringLiteral( "<span class=\"Word\">%1</span>" ).arg( makeLink( Word ) );
173 }
174 
175 /**
176  * Fill the fields of our Entry object appropriate to the given
177  * entry line from Kanjidic.
178  */
179 /* TODO: Error checking */
loadEntry(const QString & entryLine)180 bool EntryKanjidic::loadEntry( const QString &entryLine )
181 {
182   unsigned int length = entryLine.length();
183 
184   /* The loop would be a bit faster if we first grabbed the kanji (2 bytes) and then the
185      space that follows, etc. for the fixed-space portion of the entries let's try that.
186      First the first 2 bytes are guaranteed to be our kanji.  The 3rd byte is a space.
187      The 4th through 7th are an ascii representation of the JIS code.  One more space
188      Currently, kana are not detected so readings are anything that is not otherwise
189      in the 8th position. */
190   Word = entryLine.left( 1 );
191   //	QString strjis = raw.mid( 2, 4 );
192 
193   /* variables for the loop */
194   QChar ichar;
195   QString curString;
196 
197   /* we would need to do these exact things ... many times so here now. */
198   #define INCI if(i < length) \
199   { \
200     i++; \
201     ichar = entryLine.at(i); \
202   }
203   #define LOADSTRING(stringToLoad) while(entryLine.at(i) != ' ') \
204   { \
205     stringToLoad += entryLine.at(i); \
206     if(i < length) i++; \
207     else break; \
208   }
209 
210   //	qDebug() << "LOADSTRING: '" << stringToLoad << "'";
211 
212   /* We can start looping at 8 because we have guarantees about the initial
213      data.  This loop is used because the kanjidic format allows the data
214      to be in any order until the end of the line.  The format was designed
215      such that the data can be identified by the first byte. */
216   for ( unsigned int i = 7; i < length - 1; i++ )
217   {
218       ichar = entryLine.at( i );
219 
220       curString = QLatin1String("");
221       switch( ichar.unicode() )
222       {
223         case ' ':
224           /* as far as I can tell, there is no real rule forcing only 1 space so
225                   there's not really any significance to them.  This block is not
226                   reached in kanjidic itself. */
227           break;
228         case 'B':
229           /* the radical, or busyu, number */
230         case 'C':
231           /* the classical radical number, usually doesn't differ from busyu number */
232         case 'E':
233           /* Henshell's "A Guide To Remembering Japanese Characters" index number */
234         case 'F':
235           /* frequency ranking */
236         case 'G':
237           /* grade level Jouyou 1 - 6 or 8 for common use or 9 for Jinmeiyou */
238         case 'H':
239           /* number from Halpern's New Japanese-English Character Dictionary */
240         case 'K':
241           /* Gakken Kanji Dictionary index */
242         case 'L':
243           /* Heisig's "Remembering The Kanji" index */
244         case 'N':
245           /* number from Nelson's Modern Reader's Japanese-English Character Dictionary */
246         case 'O':
247           /* O'Neill's "Japanese Names" index number */
248         case 'P':
249           /* SKIP code ... #-#-# format */
250         case 'Q':
251           /* Four Corner codes, it seems, can be multiple though I'm tempted just to take the last one. */
252         case 'U':
253           /* unicode which we are ignoring as it is found in another way */
254         case 'V':
255           /* number from Haig's New Nelson Japanese-English Character Dictionary */
256         case 'W':
257           /* korean reading */
258         case 'X':
259           /* I don't entirely understand this field. */
260         case 'Y':
261           /* Pinyin reading */
262         case 'Z':
263           /* SKIP misclassifications */
264 
265           /* All of the above are of the format <Char><Data> where <Char> is
266                   exactly 1 character. */
267           i++;
268           LOADSTRING( curString );
269           ExtendedInfo.insert( QString( ichar ), curString );
270           break;
271         case 'I':
272           /* index codes for Spahn & Hadamitzky reference books we need the next
273                   char to know what to do with it. */
274           INCI
275           if( ichar == 'N' )
276           {
277             /* a Kanji & Kana book number */
278             LOADSTRING( curString )
279           }
280           else
281           {
282             /* The Kanji Dictionary number, we need the current ichar. */
283             LOADSTRING( curString )
284           }
285           ExtendedInfo.insert( 'I' + QString( ichar ), curString );
286           break;
287         case 'M':
288           /* index and page numbers for Morohashi's Daikanwajiten 2 fields possible */
289           INCI
290           if( ichar == 'N' )
291           {
292             LOADSTRING( curString )
293             /* index number */
294           }
295           else if( ichar == 'P' )
296           {
297             LOADSTRING( curString )
298             /* page number in volume.page format */
299           }
300           ExtendedInfo.insert( 'M' + QString( ichar ), curString );
301           break;
302         case 'S':
303           /* stroke count: may be multiple.  In that case, first is actual, others common
304                   miscounts */
305           i++;
306           if( ! ExtendedInfo.contains(QStringLiteral("S") ) )
307           {
308             LOADSTRING( curString )
309             ExtendedInfo.insert( QString( ichar ), curString );
310           }
311           else
312           {
313             LOADSTRING( curString )
314             ExtendedInfo.insert( '_' + QString( ichar ), curString );
315           }
316           break;
317         case 'D':
318           /* dictionary codes */
319           INCI
320           LOADSTRING( curString )
321           ExtendedInfo.insert( 'D' + QString( ichar ), curString );
322           break;
323         case '{':
324           /* This should be starting with the first '{' character of a meaning section.
325                   Let us get take it to the last. */
326           INCI
327           while( ichar != '}' )
328           {
329             curString += ichar;
330             /* sanity */
331             if( i < length )
332             {
333               i++;
334             }
335             else
336             {
337               break;
338             }
339             ichar = entryLine.at( i );
340           }
341           INCI
342 //           qDebug() << "Meaning's curString: '" << curString << "'";
343           Meanings.append( curString );
344           break;
345         case 'T': /* a reading that is used in names for T1, radical names for T2 */
346         {
347           i++;
348           LOADSTRING( curString )
349           // Get the type number (1 for T1, 2 for T2).
350           int type = curString.toInt();
351           bool finished = false;
352           while( ! finished )
353           {
354             // Skip all whitespaces.
355             INCI
356             while( ichar == ' ' )
357             {
358               INCI
359             }
360             // Check if the current character is Kana.
361             if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
362             {
363               // Reset our variable and load it with
364               // all available kana until we find a whitespace.
365               curString = QLatin1String("");
366               LOADSTRING( curString );
367               switch( type )
368               {
369                 case 1: // Special reading used in names.
370                   InNamesReadings.append( curString );
371                   break;
372                 case 2: // Reading as radical.
373                   AsRadicalReadings.append( curString );
374                   break;
375               }
376             }
377             else
378             {
379               // There are not more kana characters,
380               // so we finish this loop for now.
381               finished = true;
382             }
383           }
384           // Now 'i' points to a '{' character. We decrease its value
385           // so in the next loop we can reach the "case '{'" section.
386           i--;
387         }
388         break;
389         case '-':
390           /* a reading that is only in postposition */
391           /* any of those 2 signals a reading is to ensue. */
392           LOADSTRING( curString )
393           originalReadings.append( curString );
394 
395           // If it is Hiragana (Kunyomi)
396           if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F )
397           {
398             KunyomiReadings.append( curString );
399           }
400           // If it is Katakana (Onyomi)
401           else if( 0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
402           {
403             OnyomiReadings.append( curString );
404           }
405 
406           curString = curString.remove( '-' ).remove( '.' );
407           Readings.append( curString );
408           break;
409         default:
410           /* either a character we don't address or a problem...we should ignore it */
411 // 	  qDebug() << "hit default in kanji parser.  Unicode: '" << ichar.unicode() << "'";
412 
413           /* This should detect unicode kana */
414           // Hiragana 0x3040 - 0x309F, Katakana: 0x30A0 - 0x30FF
415           if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
416           {
417             LOADSTRING( curString )
418             originalReadings.append( curString );
419 
420             // If it is Hiragana (Kunyomi)
421             if( 0x3040 <=ichar.unicode() && ichar.unicode() <= 0x309F )
422             {
423               KunyomiReadings.append( curString );
424             }
425             // If it is Katakana (Onyomi)
426             else if( 0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
427             {
428               OnyomiReadings.append( curString );
429             }
430 
431             curString = curString.remove( '-' ).remove( '.' );
432             Readings.append( curString );
433             break;
434           }
435           /* if it's not a kana reading ... it is something unhandled ...
436              possibly a new field in kanjidic.  Let's treat it as the
437              oh-so-common <char><data> type of entry.  It could be hotly
438              debated what we should actually do about these. */
439           i++;
440           LOADSTRING( curString );
441           ExtendedInfo.insert( QString( ichar ), curString );
442 
443           break;
444       }
445   }
446 //   qDebug() << "Parsed: '"<<Word<<"' ("<<Readings.join("^")<<") \""<<
447 //   Meanings.join("|")<<"\ and " <<ExtendedInfo.keys() << " from :"<<entryLine<<endl;
448 
449   return true;
450 }
451 
/**
 * Builds an HTML anchor for a reading: the link target is @p inReading with
 * every '.' and '-' removed, the link text is @p inReading unchanged.
 */
QString EntryKanjidic::makeReadingLink( const QString &inReading ) const
{
  QString target = inReading;
  target.remove( QLatin1Char('.') );
  target.remove( QLatin1Char('-') );

  return QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( target )
                                           .arg( inReading );
}
458 
459 /**
460  * Returns a HTML version of an Entry
461  */
toHTML() const462 QString EntryKanjidic::toHTML() const
463 {
464   QString result = QStringLiteral("<div class=\"KanjidicBrief\">");
465 
466   foreach( const QString &field, QSTRINGLISTCHECK( DictFileKanjidic::displayFields ) )
467   {
468     //qDebug() << "Display: "<<field;
469     if( field == QLatin1String("--NewLine--") )              result += QLatin1String("<br>");
470     else if( field == QLatin1String("Word/Kanji") )          result += HTMLWord() + ' ';
471     else if( field == QLatin1String("Meaning") )             result += HTMLMeanings() + ' ';
472     else if( field == QLatin1String("Reading") )             result += HTMLReadings() + ' ';
473     else if( ExtendedInfo.contains( field ) ) result += HTMLExtendedInfo( field ) + ' ';
474   }
475 
476   result += QLatin1String("</div>");
477   return result;
478 }
479