1 /*
2  * JaLingo, http://jalingo.sourceforge.net/
3  *
4  * Copyright (c) 2002-2006 Oleksandr Shyshko
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20 
21 package ja.lingo.readers.mova;
22 
23 import ja.lingo.engine.beans.IInfo;
24 import ja.lingo.engine.reader.util.BaseConverter;
25 import ja.centre.util.regex.Replacers;
26 import ja.centre.util.regex.IReplacer;
27 
28 import java.util.LinkedList;
29 import java.util.regex.Pattern;
30 
31 class MovaConverter extends BaseConverter {
MovaConverter( IInfo info )32     public MovaConverter( IInfo info ) {
33         super( info );
34     }
35 
getBody( byte[] titleBytes, int titleOffset, int titleLength, byte[] bodyBytes, int bodyOffset, int bodyLength )36     public String getBody( byte[] titleBytes, int titleOffset, int titleLength, byte[] bodyBytes, int bodyOffset, int bodyLength ) {
37         return new MovaConverterRequest( convertIpa2Unicode( bodyBytes, bodyOffset, bodyLength ) ).asString();
38     }
39 
40     private static class MovaConverterRequest {
41         private static final String[] romanListElements = { "_I", "_II", "_III", "_IV", "_V", "_VI", "_VII", "_VIII", "_IX", "_X" };
42         private static final String UPPER_ROMAN = "1";
43         private static final String ARABIC_NUMBERS = "2";
44         private static final String UPPER_ALPHA = "3";
45         private static final String LOWER_ALPHA = "4";
46 
47         private static final Pattern PATTERN_SPLIT = Pattern.compile( " " );
48         private static final IReplacer REPLACER_TRANS = Replacers.regex( "(\\[.*?\\])", "<span class=\"trans\">$1</span>" );
49         private static final IReplacer REPLACER_KEYWORD = Replacers.regex( "_(\\S+?[\\.\\:])", "<span class=\"keywd\">$1</span>" );
50 
51         private StringBuilder builder = new StringBuilder();
52         private LinkedList<ListEntry> listStack = new LinkedList<ListEntry>();
53 
54         private boolean lastWasCharacters;
55         private String body;
56 
MovaConverterRequest( String articleBody )57         public MovaConverterRequest( String articleBody ) {
58             // TODO optimize split
59             for ( String token : PATTERN_SPLIT.split( articleBody ) ) {
60                 if ( isRomanListElement( token ) ) {
61                     processListElement( UPPER_ROMAN );
62                 } else if ( isArabicPointListElement( token ) ) {
63                     processListElement( ARABIC_NUMBERS );
64                 } else if ( isArabicGTListElement( token ) ) {
65                     processListElement( UPPER_ALPHA );
66                 } else if ( isLetterGTListElement( token ) ) {
67                     processListElement( LOWER_ALPHA );
68                 } else {
69                     if ( lastWasCharacters ) {
70                         builder.append( " " );
71                     }
72 
73                     builder.append( token );
74 
75                     lastWasCharacters = true;
76                 }
77             }
78             flushListsIfRequired();
79 
80             String body = builder.toString(); // regex faster are faster on Strings
81 
82             // [transcription] - "(\[.*?\])"
83             // TODO optimize
84             body = REPLACER_TRANS.replace( body );
85 
86             // _n. ... _pl. - "_(\S+?[\.\:])"
87             // TODO optimize
88             body = REPLACER_KEYWORD.replace( body );
89 
90             this.body = body;
91 
92             builder = null;
93         }
94 
asString()95         public String asString() {
96             return body;
97         }
98 
processListElement( String style )99         private void processListElement( String style ) {
100             ListEntry entry = findInListStack( style );
101             if ( entry == null ) {
102                 entry = new ListEntry( style );
103                 addListToStack( entry );
104             } else {
105                 flushListsIfRequired( entry );
106             }
107 
108             entry.increment();
109             flushLastListElement();
110             addListElementToStack( style );
111 
112             lastWasCharacters = false;
113         }
114 
flushLastListElement()115         private void flushLastListElement() {
116             while ( !listStack.isEmpty() ) {
117                 if ( listStack.getLast().isElement() ) {
118                     builder.append( "</span></li>" );
119                     listStack.removeLast();
120                 } else {
121                     break;
122                 }
123             }
124         }
flushListsIfRequired()125         private void flushListsIfRequired() {
126             flushListsIfRequired( null );
127         }
flushListsIfRequired( ListEntry entry )128         private void flushListsIfRequired( ListEntry entry ) {
129             while ( listStack.size() > 0 ) {
130                 ListEntry currentEntry = listStack.getLast();
131                 if ( entry == currentEntry ) {
132                     break;
133                 }
134                 builder.append( currentEntry.isElement() ? "</span></li>" : "</ol>" );
135                 listStack.removeLast();
136             }
137         }
138 
isRomanListElement( String token )139         private boolean isRomanListElement( String token ) {
140             if ( !token.startsWith( "_" ) ) {
141                 return false;
142             }
143 
144             for ( int i = 0; i < romanListElements.length; i++ ) {
145                 if ( token.equals( romanListElements[i] ) ) {
146                     return true;
147                 }
148             }
149 
150             return false;
151         }
isArabicPointListElement( String token )152         private boolean isArabicPointListElement( String token ) {
153             if ( !token.endsWith( "." ) ) {
154                 return false;
155             }
156 
157             for ( int i = 0; i < token.length() - 1; i++ ) {
158                 char c = token.charAt( i );
159                 if ( c < '0' || c > '9' ) {
160                     return false;
161                 }
162             }
163             return true;
164         }
isArabicGTListElement( String token )165         private boolean isArabicGTListElement( String token ) {
166             if ( !token.endsWith( ">" ) ) {
167                 return false;
168             }
169 
170             for ( int i = 0; i < token.length() - 1; i++ ) {
171                 char c = token.charAt( i );
172                 if ( c < '0' || c > '9' ) {
173                     return false;
174                 }
175             }
176             return true;
177         }
isLetterGTListElement( String token )178         private boolean isLetterGTListElement( String token ) {
179             if ( !token.endsWith( ">" ) ) {
180                 return false;
181             }
182 
183             char c = token.charAt( 0 );
184 
185             if ( c < 0x0430 || c > 0x044f ) {
186                 return false;
187             }
188 
189             return true;
190         }
191 
findInListStack( String name )192         private ListEntry findInListStack( String name ) {
193             for ( int i = listStack.size() - 1; i >= 0; i-- ) {
194                 ListEntry entry = listStack.get( i );
195                 if ( name.equals( entry.name ) ) {
196                     return entry;
197                 }
198             }
199             return null;
200         }
addListToStack( ListEntry entry )201         private void addListToStack( ListEntry entry ) {
202             builder.append( "<ol class=\"" ).append( entry.getName() ).append( "\">" );
203             listStack.add( entry );
204         }
addListElementToStack( String name )205         private void addListElementToStack( String name ) {
206             ListEntry entry = new ListEntry( name, true );
207             builder.append( "<li><span class='black'>" );
208             listStack.add( entry );
209         }
210 
211         private static class ListEntry {
212             private String name;
213             private int value;
214             private boolean isElement;
215 
ListEntry( String name )216             public ListEntry( String name ) {
217                 this( name, false );
218             }
ListEntry( String name, boolean element )219             public ListEntry( String name, boolean element ) {
220                 this.name = name;
221                 isElement = element;
222             }
223 
getName()224             public String getName() {
225                 return name;
226             }
getValue()227             public int getValue() {
228                 return value;
229             }
230 
increment()231             public void increment() {
232                 value++;
233             }
234 
isElement()235             public boolean isElement() {
236                 return isElement;
237             }
238         }
239     }
240 
convertIpa2Unicode( byte[] dataBytes, int bodyOffset, int bodyLength )241     public String convertIpa2Unicode( byte[] dataBytes, int bodyOffset, int bodyLength ) {
242         StringBuilder builder = new StringBuilder( createString( dataBytes, bodyOffset, bodyLength ) );
243         int bracketDepth = 0;
244         for ( int i = 0; i < builder.length(); i++ ) {
245             char currentChar = builder.charAt( i );
246             if ( currentChar == '[' ) {
247                 bracketDepth++;
248             } else if ( currentChar == ']' ) {
249                 bracketDepth--;
250             }
251 
252             if ( bracketDepth > 0 ) {
253                 int b = (dataBytes[i + bodyOffset] & 0xFF);
254                 char newChar = IPA_2_UNICODE_TABLE[b];
255                 builder.setCharAt( i, newChar );
256             }
257         }
258 
259         return builder.toString();
260     }
261 
262     private final char[] IPA_2_UNICODE_TABLE = {
263             0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 0
264             0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 16
265             0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 32
266             0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 48
267             0x0020, 0x030B, 0x0131, 0x0304, 0x0300, 0x030F, 0x030C, 0x02BC, // 64
268 
269             0x0306, 0x0303, 0x030A, 0x031F, 0x002C, 0x0324, 0x002E, 0x002F,
270             0x0330, 0x0318, 0x0319, 0x031D, 0x031E, 0x032A, 0x033B, 0x031C,
271             0x0325, 0x032F, 0x02E1, 0x029F, 0x207F, 0x0320, 0x02D1, 0x0294,
272             0x0301, 0x0251, 0x03B2, 0x0063, 0x00F0, 0x025B, 0x0264, 0x0262,
273             0x02B0, 0x026A, 0x02B2, 0x029C, 0x026E, 0x0271, 0x014B, 0x00F8,
274 
275             0x0275, 0x00E6, 0x027E, 0x0283, 0x03B8, 0x028A, 0x028B, 0x02B7,
276             0x03C7, 0x028F, 0x0292, 0x005B, 0x005C, 0x005D, 0x0302, 0x0308,
277             0x0329, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0261,
278             0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
279             0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
280 
281             0x0078, 0x0079, 0x007A, 0x0280, 0x031A, 0x027D, 0x033D, 0x007F,
282             0x02E9, 0x0252, 0x0258, 0x0361, 0x2016, 0x02E5, 0x02E5, 0x0298,
283             0x030B, 0x030B, 0x02E5, 0x2191, 0x0250, 0x0254, 0x01C0, 0x0301,
284             0x0301, 0x02E6, 0x01C1, 0x0304, 0x0304, 0x02E7, 0x007C, 0x01C3,
285             0x0300, 0x0300, 0x02E8, 0x2193, 0x01C2, 0x030F, 0x030F, 0x02E9,
286 
287             0x0020, 0x030A, 0x031E, 0x031D, 0x032C, 0x0325, 0x0339, 0x0282,
288             0x0279, 0x0260, 0x0319, 0x0259, 0x0289, 0x0320, 0x0268, 0x0276,
289             0x033A, 0x031F, 0x0274, 0x02E4, 0x028E, 0x026F, 0x0020, 0x0020,
290             0x0278, 0x02A2, 0x0253, 0x032F, 0x0330, 0x0290, 0x006A, 0x0153,
291             0x0295, 0x0318, 0x026C, 0x028C, 0x0263, 0x0020, 0x029D, 0x02CC,
292 
293             0x02C8, 0xF180, 0x200A, 0xF181, 0x2197, 0x2198, 0x025C, 0x025E,
294             0x0324, 0x033C, 0x0281, 0x027B, 0xF182, 0x02DE, 0x002D, 0x0284,
295             0x02E7, 0x02E7, 0x030B, 0x0301, 0x0304, 0x0300, 0x030F, 0x0302,
296             0x030C, 0x0306, 0x0303, 0x028D, 0x027A, 0x0270, 0x0302, 0x0265,
297             0x02E9, 0x0302, 0x0256, 0x0257, 0x02E0, 0x203F, 0x0267, 0x025F,
298 
299             0x0127, 0x026D, 0x026B, 0x030C, 0x030C, 0x0299, 0x0268, 0x0273,
300             0x0272, 0x003A, 0x0266, 0x02A1, 0x0291, 0x029B, 0x0255, 0x0288
301     };
302 }
303