1 /* 2 * JaLingo, http://jalingo.sourceforge.net/ 3 * 4 * Copyright (c) 2002-2006 Oleksandr Shyshko 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 */ 20 21 package ja.lingo.readers.mova; 22 23 import ja.lingo.engine.beans.IInfo; 24 import ja.lingo.engine.reader.util.BaseConverter; 25 import ja.centre.util.regex.Replacers; 26 import ja.centre.util.regex.IReplacer; 27 28 import java.util.LinkedList; 29 import java.util.regex.Pattern; 30 31 class MovaConverter extends BaseConverter { MovaConverter( IInfo info )32 public MovaConverter( IInfo info ) { 33 super( info ); 34 } 35 getBody( byte[] titleBytes, int titleOffset, int titleLength, byte[] bodyBytes, int bodyOffset, int bodyLength )36 public String getBody( byte[] titleBytes, int titleOffset, int titleLength, byte[] bodyBytes, int bodyOffset, int bodyLength ) { 37 return new MovaConverterRequest( convertIpa2Unicode( bodyBytes, bodyOffset, bodyLength ) ).asString(); 38 } 39 40 private static class MovaConverterRequest { 41 private static final String[] romanListElements = { "_I", "_II", "_III", "_IV", "_V", "_VI", "_VII", "_VIII", "_IX", "_X" }; 42 private static final String UPPER_ROMAN = "1"; 43 private static final String ARABIC_NUMBERS = "2"; 44 private static final String UPPER_ALPHA = "3"; 45 private static final String LOWER_ALPHA = "4"; 46 47 private static final Pattern PATTERN_SPLIT = Pattern.compile( " " ); 48 private static final IReplacer REPLACER_TRANS = Replacers.regex( "(\\[.*?\\])", "<span class=\"trans\">$1</span>" ); 49 private static final IReplacer REPLACER_KEYWORD = Replacers.regex( "_(\\S+?[\\.\\:])", "<span class=\"keywd\">$1</span>" ); 50 51 private StringBuilder builder = new StringBuilder(); 52 private LinkedList<ListEntry> listStack = new LinkedList<ListEntry>(); 53 54 private boolean lastWasCharacters; 55 private String body; 56 MovaConverterRequest( String articleBody )57 public MovaConverterRequest( String articleBody ) { 58 // TODO optimize split 59 for ( String token : PATTERN_SPLIT.split( articleBody ) ) { 60 if ( isRomanListElement( token ) ) { 61 processListElement( UPPER_ROMAN ); 62 } else if ( isArabicPointListElement( token ) ) { 63 processListElement( ARABIC_NUMBERS ); 64 } else if ( isArabicGTListElement( token ) ) { 65 processListElement( UPPER_ALPHA ); 66 } else if ( isLetterGTListElement( token ) ) { 67 processListElement( LOWER_ALPHA ); 68 } else { 69 if ( lastWasCharacters ) { 70 builder.append( " " ); 71 } 72 73 builder.append( token ); 74 75 lastWasCharacters = true; 76 } 77 } 78 flushListsIfRequired(); 79 80 String body = builder.toString(); // regex faster are faster on Strings 81 82 // [transcription] - "(\[.*?\])" 83 // TODO optimize 84 body = REPLACER_TRANS.replace( body ); 85 86 // _n. ... _pl. - "_(\S+?[\.\:])" 87 // TODO optimize 88 body = REPLACER_KEYWORD.replace( body ); 89 90 this.body = body; 91 92 builder = null; 93 } 94 asString()95 public String asString() { 96 return body; 97 } 98 processListElement( String style )99 private void processListElement( String style ) { 100 ListEntry entry = findInListStack( style ); 101 if ( entry == null ) { 102 entry = new ListEntry( style ); 103 addListToStack( entry ); 104 } else { 105 flushListsIfRequired( entry ); 106 } 107 108 entry.increment(); 109 flushLastListElement(); 110 addListElementToStack( style ); 111 112 lastWasCharacters = false; 113 } 114 flushLastListElement()115 private void flushLastListElement() { 116 while ( !listStack.isEmpty() ) { 117 if ( listStack.getLast().isElement() ) { 118 builder.append( "</span></li>" ); 119 listStack.removeLast(); 120 } else { 121 break; 122 } 123 } 124 } flushListsIfRequired()125 private void flushListsIfRequired() { 126 flushListsIfRequired( null ); 127 } flushListsIfRequired( ListEntry entry )128 private void flushListsIfRequired( ListEntry entry ) { 129 while ( listStack.size() > 0 ) { 130 ListEntry currentEntry = listStack.getLast(); 131 if ( entry == currentEntry ) { 132 break; 133 } 134 builder.append( currentEntry.isElement() ? "</span></li>" : "</ol>" ); 135 listStack.removeLast(); 136 } 137 } 138 isRomanListElement( String token )139 private boolean isRomanListElement( String token ) { 140 if ( !token.startsWith( "_" ) ) { 141 return false; 142 } 143 144 for ( int i = 0; i < romanListElements.length; i++ ) { 145 if ( token.equals( romanListElements[i] ) ) { 146 return true; 147 } 148 } 149 150 return false; 151 } isArabicPointListElement( String token )152 private boolean isArabicPointListElement( String token ) { 153 if ( !token.endsWith( "." ) ) { 154 return false; 155 } 156 157 for ( int i = 0; i < token.length() - 1; i++ ) { 158 char c = token.charAt( i ); 159 if ( c < '0' || c > '9' ) { 160 return false; 161 } 162 } 163 return true; 164 } isArabicGTListElement( String token )165 private boolean isArabicGTListElement( String token ) { 166 if ( !token.endsWith( ">" ) ) { 167 return false; 168 } 169 170 for ( int i = 0; i < token.length() - 1; i++ ) { 171 char c = token.charAt( i ); 172 if ( c < '0' || c > '9' ) { 173 return false; 174 } 175 } 176 return true; 177 } isLetterGTListElement( String token )178 private boolean isLetterGTListElement( String token ) { 179 if ( !token.endsWith( ">" ) ) { 180 return false; 181 } 182 183 char c = token.charAt( 0 ); 184 185 if ( c < 0x0430 || c > 0x044f ) { 186 return false; 187 } 188 189 return true; 190 } 191 findInListStack( String name )192 private ListEntry findInListStack( String name ) { 193 for ( int i = listStack.size() - 1; i >= 0; i-- ) { 194 ListEntry entry = listStack.get( i ); 195 if ( name.equals( entry.name ) ) { 196 return entry; 197 } 198 } 199 return null; 200 } addListToStack( ListEntry entry )201 private void addListToStack( ListEntry entry ) { 202 builder.append( "<ol class=\"" ).append( entry.getName() ).append( "\">" ); 203 listStack.add( entry ); 204 } addListElementToStack( String name )205 private void addListElementToStack( String name ) { 206 ListEntry entry = new ListEntry( name, true ); 207 builder.append( "<li><span class='black'>" ); 208 listStack.add( entry ); 209 } 210 211 private static class ListEntry { 212 private String name; 213 private int value; 214 private boolean isElement; 215 ListEntry( String name )216 public ListEntry( String name ) { 217 this( name, false ); 218 } ListEntry( String name, boolean element )219 public ListEntry( String name, boolean element ) { 220 this.name = name; 221 isElement = element; 222 } 223 getName()224 public String getName() { 225 return name; 226 } getValue()227 public int getValue() { 228 return value; 229 } 230 increment()231 public void increment() { 232 value++; 233 } 234 isElement()235 public boolean isElement() { 236 return isElement; 237 } 238 } 239 } 240 convertIpa2Unicode( byte[] dataBytes, int bodyOffset, int bodyLength )241 public String convertIpa2Unicode( byte[] dataBytes, int bodyOffset, int bodyLength ) { 242 StringBuilder builder = new StringBuilder( createString( dataBytes, bodyOffset, bodyLength ) ); 243 int bracketDepth = 0; 244 for ( int i = 0; i < builder.length(); i++ ) { 245 char currentChar = builder.charAt( i ); 246 if ( currentChar == '[' ) { 247 bracketDepth++; 248 } else if ( currentChar == ']' ) { 249 bracketDepth--; 250 } 251 252 if ( bracketDepth > 0 ) { 253 int b = (dataBytes[i + bodyOffset] & 0xFF); 254 char newChar = IPA_2_UNICODE_TABLE[b]; 255 builder.setCharAt( i, newChar ); 256 } 257 } 258 259 return builder.toString(); 260 } 261 262 private final char[] IPA_2_UNICODE_TABLE = { 263 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 0 264 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 16 265 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 32 266 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // 48 267 0x0020, 0x030B, 0x0131, 0x0304, 0x0300, 0x030F, 0x030C, 0x02BC, // 64 268 269 0x0306, 0x0303, 0x030A, 0x031F, 0x002C, 0x0324, 0x002E, 0x002F, 270 0x0330, 0x0318, 0x0319, 0x031D, 0x031E, 0x032A, 0x033B, 0x031C, 271 0x0325, 0x032F, 0x02E1, 0x029F, 0x207F, 0x0320, 0x02D1, 0x0294, 272 0x0301, 0x0251, 0x03B2, 0x0063, 0x00F0, 0x025B, 0x0264, 0x0262, 273 0x02B0, 0x026A, 0x02B2, 0x029C, 0x026E, 0x0271, 0x014B, 0x00F8, 274 275 0x0275, 0x00E6, 0x027E, 0x0283, 0x03B8, 0x028A, 0x028B, 0x02B7, 276 0x03C7, 0x028F, 0x0292, 0x005B, 0x005C, 0x005D, 0x0302, 0x0308, 277 0x0329, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0261, 278 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 279 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 280 281 0x0078, 0x0079, 0x007A, 0x0280, 0x031A, 0x027D, 0x033D, 0x007F, 282 0x02E9, 0x0252, 0x0258, 0x0361, 0x2016, 0x02E5, 0x02E5, 0x0298, 283 0x030B, 0x030B, 0x02E5, 0x2191, 0x0250, 0x0254, 0x01C0, 0x0301, 284 0x0301, 0x02E6, 0x01C1, 0x0304, 0x0304, 0x02E7, 0x007C, 0x01C3, 285 0x0300, 0x0300, 0x02E8, 0x2193, 0x01C2, 0x030F, 0x030F, 0x02E9, 286 287 0x0020, 0x030A, 0x031E, 0x031D, 0x032C, 0x0325, 0x0339, 0x0282, 288 0x0279, 0x0260, 0x0319, 0x0259, 0x0289, 0x0320, 0x0268, 0x0276, 289 0x033A, 0x031F, 0x0274, 0x02E4, 0x028E, 0x026F, 0x0020, 0x0020, 290 0x0278, 0x02A2, 0x0253, 0x032F, 0x0330, 0x0290, 0x006A, 0x0153, 291 0x0295, 0x0318, 0x026C, 0x028C, 0x0263, 0x0020, 0x029D, 0x02CC, 292 293 0x02C8, 0xF180, 0x200A, 0xF181, 0x2197, 0x2198, 0x025C, 0x025E, 294 0x0324, 0x033C, 0x0281, 0x027B, 0xF182, 0x02DE, 0x002D, 0x0284, 295 0x02E7, 0x02E7, 0x030B, 0x0301, 0x0304, 0x0300, 0x030F, 0x0302, 296 0x030C, 0x0306, 0x0303, 0x028D, 0x027A, 0x0270, 0x0302, 0x0265, 297 0x02E9, 0x0302, 0x0256, 0x0257, 0x02E0, 0x203F, 0x0267, 0x025F, 298 299 0x0127, 0x026D, 0x026B, 0x030C, 0x030C, 0x0299, 0x0268, 0x0273, 300 0x0272, 0x003A, 0x0266, 0x02A1, 0x0291, 0x029B, 0x0255, 0x0288 301 }; 302 } 303