1 /*
2 This file is part of Kiten, a KDE Japanese Reference Tool
3 SPDX-FileCopyrightText: 2001 Jason Katz-Brown <jason@katzbrown.com>
4 SPDX-FileCopyrightText: 2006 Joseph Kerian <jkerian@gmail.com>
5 SPDX-FileCopyrightText: 2006 Eric Kjeldergaard <kjelderg@gmail.com>
6 SPDX-FileCopyrightText: 2011 Daniel E. Moctezuma <democtezuma@gmail.com>
7
8 SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10
11 #include "entrykanjidic.h"
12
13 #include "dictfilekanjidic.h"
14 #include "kitenmacros.h"
15
16 #include <KLocalizedString>
17 #include <QDebug>
18
/* Yields a QStringList by value: an empty list when the pointer x compares
   equal to NULL, otherwise a copy of the list x points to. */
#define QSTRINGLISTCHECK(x) (x==NULL?QStringList():*x)
20
/**
 * Copy constructor.
 *
 * The state declared by Entry is copied by the Entry copy constructor; the
 * reading lists that loadEntry() fills are copied explicitly so that a copy
 * (and therefore clone()) carries the same readings as the original.
 *
 * @param dict the entry to copy
 */
EntryKanjidic::EntryKanjidic( const EntryKanjidic &dict )
  : Entry( dict )
{
  // NOTE(review): these lists are presumably declared by EntryKanjidic itself
  // (confirm in entrykanjidic.h), in which case Entry( dict ) cannot copy
  // them. They are assigned here rather than in the initializer list so this
  // stays valid (merely redundant) should any of them live in Entry instead.
  AsRadicalReadings = dict.AsRadicalReadings;
  InNamesReadings = dict.InNamesReadings;
  KunyomiReadings = dict.KunyomiReadings;
  OnyomiReadings = dict.OnyomiReadings;
  originalReadings = dict.originalReadings;
}
25
EntryKanjidic(const QString & dict)26 EntryKanjidic::EntryKanjidic( const QString &dict )
27 : Entry( dict )
28 {
29 }
30
EntryKanjidic(const QString & dict,const QString & entry)31 EntryKanjidic::EntryKanjidic( const QString &dict, const QString &entry )
32 : Entry( dict )
33 {
34 loadEntry( entry );
35 }
36
addReadings(const QStringList & list) const37 QString EntryKanjidic::addReadings( const QStringList &list ) const
38 {
39 QString readings;
40 foreach( const QString &reading, list )
41 {
42 readings.append( makeReadingLink( reading ) + outputListDelimiter );
43 }
44
45 return readings;
46 }
47
clone() const48 Entry* EntryKanjidic::clone() const
49 {
50 return new EntryKanjidic( *this );
51 }
52
53 /**
54 * This reproduces a kanjidic-formatted line from the Entry.
55 * Look at the above parser to see how the format works.
56 */
dumpEntry() const57 QString EntryKanjidic::dumpEntry() const
58 {
59 /* Loop over the ExtendedInfo to add it to the line we produce */
60 QString dumpExtendedInfo;
61 QHash<QString,QString>::const_iterator it;
62 for( it = ExtendedInfo.constBegin(); it != ExtendedInfo.constEnd(); ++it )
63 {
64 dumpExtendedInfo += ' ' + it.key() + it.value();
65 }
66
67 return QStringLiteral( "%1 %2%3" ).arg( Word )
68 .arg( Readings.join( QLatin1Char(' ') ) )
69 .arg( dumpExtendedInfo );
70 }
71
extendedItemCheck(const QString & key,const QString & value) const72 bool EntryKanjidic::extendedItemCheck( const QString &key, const QString &value ) const
73 {
74 if( key == QLatin1String("common") )
75 {
76 return ! getExtendedInfoItem( QStringLiteral("G") ).isEmpty();
77 }
78
79 return Entry::extendedItemCheck( key, value );
80 }
81
getAsRadicalReadings() const82 QString EntryKanjidic::getAsRadicalReadings() const
83 {
84 return AsRadicalReadings.join( outputListDelimiter );
85 }
86
getAsRadicalReadingsList() const87 QStringList EntryKanjidic::getAsRadicalReadingsList() const
88 {
89 return AsRadicalReadings;
90 }
91
getDictionaryType() const92 QString EntryKanjidic::getDictionaryType() const
93 {
94 return KANJIDIC;
95 }
96
getInNamesReadings() const97 QString EntryKanjidic::getInNamesReadings() const
98 {
99 return InNamesReadings.join( outputListDelimiter );
100 }
101
getInNamesReadingsList() const102 QStringList EntryKanjidic::getInNamesReadingsList() const
103 {
104 return InNamesReadings;
105 }
106
getKanjiGrade() const107 QString EntryKanjidic::getKanjiGrade() const
108 {
109 return getExtendedInfoItem( QStringLiteral("G") );
110 }
111
getKunyomiReadings() const112 QString EntryKanjidic::getKunyomiReadings() const
113 {
114 return KunyomiReadings.join( outputListDelimiter );
115 }
116
getKunyomiReadingsList() const117 QStringList EntryKanjidic::getKunyomiReadingsList() const
118 {
119 return KunyomiReadings;
120 }
121
getOnyomiReadings() const122 QString EntryKanjidic::getOnyomiReadings() const
123 {
124 return OnyomiReadings.join( outputListDelimiter );
125 }
126
getOnyomiReadingsList() const127 QStringList EntryKanjidic::getOnyomiReadingsList() const
128 {
129 return OnyomiReadings;
130 }
131
getStrokesCount() const132 QString EntryKanjidic::getStrokesCount() const
133 {
134 return getExtendedInfoItem( QStringLiteral("S") );
135 }
136
/**
 * Renders one extended-info item as HTML.
 *
 * @param field the ExtendedInfo key to render
 * @return a span of class "ExtendedInfo" containing "<field>: <value>";
 *         the value is empty when ExtendedInfo has no such key
 */
QString EntryKanjidic::HTMLExtendedInfo( const QString &field ) const
{
  // Both values are substituted in one pass. With chained .arg() calls a
  // field name containing a place marker such as "%2" would itself be
  // replaced by the second call.
  return QStringLiteral( "<span class=\"ExtendedInfo\">%1: %2</span>" )
           .arg( field, ExtendedInfo.value( field ) );
}
144
145 /**
146 * Prepares Readings for output as HTML
147 */
HTMLReadings() const148 QString EntryKanjidic::HTMLReadings() const
149 {
150 QString htmlReadings;
151 htmlReadings += addReadings( originalReadings );
152
153 if( InNamesReadings.count() > 0 )
154 {
155 htmlReadings += i18n( "In names: " );
156 htmlReadings += addReadings( InNamesReadings );
157 }
158
159 if( AsRadicalReadings.count() > 0 )
160 {
161 htmlReadings += i18n( "As radical: " );
162 htmlReadings += addReadings( AsRadicalReadings );
163 }
164
165 // get rid of last ,
166 htmlReadings.truncate( htmlReadings.length() - outputListDelimiter.length() );
167 return QStringLiteral( "<span class=\"Readings\">%1</span>" ).arg( htmlReadings );
168 }
169
HTMLWord() const170 QString EntryKanjidic::HTMLWord() const
171 {
172 return QStringLiteral( "<span class=\"Word\">%1</span>" ).arg( makeLink( Word ) );
173 }
174
175 /**
176 * Fill the fields of our Entry object appropriate to the given
177 * entry line from Kanjidic.
178 */
179 /* TODO: Error checking */
loadEntry(const QString & entryLine)180 bool EntryKanjidic::loadEntry( const QString &entryLine )
181 {
182 unsigned int length = entryLine.length();
183
184 /* The loop would be a bit faster if we first grabbed the kanji (2 bytes) and then the
185 space that follows, etc. for the fixed-space portion of the entries let's try that.
186 First the first 2 bytes are guaranteed to be our kanji. The 3rd byte is a space.
187 The 4th through 7th are an ascii representation of the JIS code. One more space
188 Currently, kana are not detected so readings are anything that is not otherwise
189 in the 8th position. */
190 Word = entryLine.left( 1 );
191 // QString strjis = raw.mid( 2, 4 );
192
193 /* variables for the loop */
194 QChar ichar;
195 QString curString;
196
197 /* we would need to do these exact things ... many times so here now. */
198 #define INCI if(i < length) \
199 { \
200 i++; \
201 ichar = entryLine.at(i); \
202 }
203 #define LOADSTRING(stringToLoad) while(entryLine.at(i) != ' ') \
204 { \
205 stringToLoad += entryLine.at(i); \
206 if(i < length) i++; \
207 else break; \
208 }
209
210 // qDebug() << "LOADSTRING: '" << stringToLoad << "'";
211
212 /* We can start looping at 8 because we have guarantees about the initial
213 data. This loop is used because the kanjidic format allows the data
214 to be in any order until the end of the line. The format was designed
215 such that the data can be identified by the first byte. */
216 for ( unsigned int i = 7; i < length - 1; i++ )
217 {
218 ichar = entryLine.at( i );
219
220 curString = QLatin1String("");
221 switch( ichar.unicode() )
222 {
223 case ' ':
224 /* as far as I can tell, there is no real rule forcing only 1 space so
225 there's not really any significance to them. This block is not
226 reached in kanjidic itself. */
227 break;
228 case 'B':
229 /* the radical, or busyu, number */
230 case 'C':
231 /* the classical radical number, usually doesn't differ from busyu number */
232 case 'E':
233 /* Henshell's "A Guide To Remembering Japanese Characters" index number */
234 case 'F':
235 /* frequency ranking */
236 case 'G':
237 /* grade level Jouyou 1 - 6 or 8 for common use or 9 for Jinmeiyou */
238 case 'H':
239 /* number from Halpern's New Japanese-English Character Dictionary */
240 case 'K':
241 /* Gakken Kanji Dictionary index */
242 case 'L':
243 /* Heisig's "Remembering The Kanji" index */
244 case 'N':
245 /* number from Nelson's Modern Reader's Japanese-English Character Dictionary */
246 case 'O':
247 /* O'Neill's "Japanese Names" index number */
248 case 'P':
249 /* SKIP code ... #-#-# format */
250 case 'Q':
251 /* Four Corner codes, it seems, can be multiple though I'm tempted just to take the last one. */
252 case 'U':
253 /* unicode which we are ignoring as it is found in another way */
254 case 'V':
255 /* number from Haig's New Nelson Japanese-English Character Dictionary */
256 case 'W':
257 /* korean reading */
258 case 'X':
259 /* I don't entirely understand this field. */
260 case 'Y':
261 /* Pinyin reading */
262 case 'Z':
263 /* SKIP misclassifications */
264
265 /* All of the above are of the format <Char><Data> where <Char> is
266 exactly 1 character. */
267 i++;
268 LOADSTRING( curString );
269 ExtendedInfo.insert( QString( ichar ), curString );
270 break;
271 case 'I':
272 /* index codes for Spahn & Hadamitzky reference books we need the next
273 char to know what to do with it. */
274 INCI
275 if( ichar == 'N' )
276 {
277 /* a Kanji & Kana book number */
278 LOADSTRING( curString )
279 }
280 else
281 {
282 /* The Kanji Dictionary number, we need the current ichar. */
283 LOADSTRING( curString )
284 }
285 ExtendedInfo.insert( 'I' + QString( ichar ), curString );
286 break;
287 case 'M':
288 /* index and page numbers for Morohashi's Daikanwajiten 2 fields possible */
289 INCI
290 if( ichar == 'N' )
291 {
292 LOADSTRING( curString )
293 /* index number */
294 }
295 else if( ichar == 'P' )
296 {
297 LOADSTRING( curString )
298 /* page number in volume.page format */
299 }
300 ExtendedInfo.insert( 'M' + QString( ichar ), curString );
301 break;
302 case 'S':
303 /* stroke count: may be multiple. In that case, first is actual, others common
304 miscounts */
305 i++;
306 if( ! ExtendedInfo.contains(QStringLiteral("S") ) )
307 {
308 LOADSTRING( curString )
309 ExtendedInfo.insert( QString( ichar ), curString );
310 }
311 else
312 {
313 LOADSTRING( curString )
314 ExtendedInfo.insert( '_' + QString( ichar ), curString );
315 }
316 break;
317 case 'D':
318 /* dictionary codes */
319 INCI
320 LOADSTRING( curString )
321 ExtendedInfo.insert( 'D' + QString( ichar ), curString );
322 break;
323 case '{':
324 /* This should be starting with the first '{' character of a meaning section.
325 Let us get take it to the last. */
326 INCI
327 while( ichar != '}' )
328 {
329 curString += ichar;
330 /* sanity */
331 if( i < length )
332 {
333 i++;
334 }
335 else
336 {
337 break;
338 }
339 ichar = entryLine.at( i );
340 }
341 INCI
342 // qDebug() << "Meaning's curString: '" << curString << "'";
343 Meanings.append( curString );
344 break;
345 case 'T': /* a reading that is used in names for T1, radical names for T2 */
346 {
347 i++;
348 LOADSTRING( curString )
349 // Get the type number (1 for T1, 2 for T2).
350 int type = curString.toInt();
351 bool finished = false;
352 while( ! finished )
353 {
354 // Skip all whitespaces.
355 INCI
356 while( ichar == ' ' )
357 {
358 INCI
359 }
360 // Check if the current character is Kana.
361 if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
362 {
363 // Reset our variable and load it with
364 // all available kana until we find a whitespace.
365 curString = QLatin1String("");
366 LOADSTRING( curString );
367 switch( type )
368 {
369 case 1: // Special reading used in names.
370 InNamesReadings.append( curString );
371 break;
372 case 2: // Reading as radical.
373 AsRadicalReadings.append( curString );
374 break;
375 }
376 }
377 else
378 {
379 // There are not more kana characters,
380 // so we finish this loop for now.
381 finished = true;
382 }
383 }
384 // Now 'i' points to a '{' character. We decrease its value
385 // so in the next loop we can reach the "case '{'" section.
386 i--;
387 }
388 break;
389 case '-':
390 /* a reading that is only in postposition */
391 /* any of those 2 signals a reading is to ensue. */
392 LOADSTRING( curString )
393 originalReadings.append( curString );
394
395 // If it is Hiragana (Kunyomi)
396 if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F )
397 {
398 KunyomiReadings.append( curString );
399 }
400 // If it is Katakana (Onyomi)
401 else if( 0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
402 {
403 OnyomiReadings.append( curString );
404 }
405
406 curString = curString.remove( '-' ).remove( '.' );
407 Readings.append( curString );
408 break;
409 default:
410 /* either a character we don't address or a problem...we should ignore it */
411 // qDebug() << "hit default in kanji parser. Unicode: '" << ichar.unicode() << "'";
412
413 /* This should detect unicode kana */
414 // Hiragana 0x3040 - 0x309F, Katakana: 0x30A0 - 0x30FF
415 if( 0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
416 {
417 LOADSTRING( curString )
418 originalReadings.append( curString );
419
420 // If it is Hiragana (Kunyomi)
421 if( 0x3040 <=ichar.unicode() && ichar.unicode() <= 0x309F )
422 {
423 KunyomiReadings.append( curString );
424 }
425 // If it is Katakana (Onyomi)
426 else if( 0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF )
427 {
428 OnyomiReadings.append( curString );
429 }
430
431 curString = curString.remove( '-' ).remove( '.' );
432 Readings.append( curString );
433 break;
434 }
435 /* if it's not a kana reading ... it is something unhandled ...
436 possibly a new field in kanjidic. Let's treat it as the
437 oh-so-common <char><data> type of entry. It could be hotly
438 debated what we should actually do about these. */
439 i++;
440 LOADSTRING( curString );
441 ExtendedInfo.insert( QString( ichar ), curString );
442
443 break;
444 }
445 }
446 // qDebug() << "Parsed: '"<<Word<<"' ("<<Readings.join("^")<<") \""<<
447 // Meanings.join("|")<<"\ and " <<ExtendedInfo.keys() << " from :"<<entryLine<<endl;
448
449 return true;
450 }
451
/**
 * Turns a reading into an HTML link.
 *
 * @param inReading the reading as it should be displayed
 * @return an anchor whose text is inReading and whose href is inReading
 *         with all '.' and '-' characters removed
 */
QString EntryKanjidic::makeReadingLink( const QString &inReading ) const
{
  QString target = inReading;
  target.remove( QLatin1Char( '.' ) ).remove( QLatin1Char( '-' ) );

  // Single-pass substitution: with chained .arg() calls a reading that
  // contains a place marker such as "%2" would be rewritten by the second
  // call.
  return QStringLiteral( "<a href=\"%1\">%2</a>" ).arg( target, inReading );
}
458
459 /**
460 * Returns a HTML version of an Entry
461 */
toHTML() const462 QString EntryKanjidic::toHTML() const
463 {
464 QString result = QStringLiteral("<div class=\"KanjidicBrief\">");
465
466 foreach( const QString &field, QSTRINGLISTCHECK( DictFileKanjidic::displayFields ) )
467 {
468 //qDebug() << "Display: "<<field;
469 if( field == QLatin1String("--NewLine--") ) result += QLatin1String("<br>");
470 else if( field == QLatin1String("Word/Kanji") ) result += HTMLWord() + ' ';
471 else if( field == QLatin1String("Meaning") ) result += HTMLMeanings() + ' ';
472 else if( field == QLatin1String("Reading") ) result += HTMLReadings() + ' ';
473 else if( ExtendedInfo.contains( field ) ) result += HTMLExtendedInfo( field ) + ' ';
474 }
475
476 result += QLatin1String("</div>");
477 return result;
478 }
479