1 /******************************************************************************
2 *
3 * thmlosis.cpp - filter to convert ThML to OSIS
4 *
5 * $Id: thmlosis.cpp 2833 2013-06-29 06:40:28Z chrislit $
6 *
7 * Copyright 2002-2013 CrossWire Bible Society (http://www.crosswire.org)
8 * CrossWire Bible Society
9 * P. O. Box 2528
10 * Tempe, AZ 85280-2528
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the
14 * Free Software Foundation version 2.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 */
22
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <stdarg.h>
26 #include <ctype.h>
27 #include <thmlosis.h>
28 #include <swmodule.h>
29 #include <swlog.h>
30 #include <versekey.h>
31 #include <utilstr.h>
32 #include <utilxml.h>
33
34
35 SWORD_NAMESPACE_START
36
ThMLOSIS()37 ThMLOSIS::ThMLOSIS() {
38 }
39
40
~ThMLOSIS()41 ThMLOSIS::~ThMLOSIS() {
42 }
43
44
processText(SWBuf & text,const SWKey * key,const SWModule * module)45 char ThMLOSIS::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
46 char token[2048]; // cheese. Fix.
47 int tokpos = 0;
48 bool intoken = false;
49 bool keepToken = false;
50 bool ampersand = false;
51
52 // static QuoteStack quoteStack;
53
54 bool lastspace = false;
55 char val[128];
56 SWBuf buf;
57 char *valto;
58 char *ch;
59
60 const char *wordStart = text.c_str();
61 const char *wordEnd = NULL;
62
63 const char *textStart = NULL;
64 const char *textEnd = NULL;
65
66 bool suspendTextPassThru = false;
67 bool handled = false;
68 bool newText = false;
69 bool newWord = false;
70
71 // SWBuf tmp;
72 SWBuf divEnd = "";
73
74 SWBuf orig = text;
75 const char* from = orig.c_str();
76
77 text = "";
78 for (from = orig.c_str(); *from; ++from) {
79
80 // handle silly <variant word> items in greek whnu, remove when module is fixed
81 if ((*from == '<') && (*(from+1) < 0)) {
82 text += "<";
83 continue;
84 }
85
86 if (*from == '<') { //start of new token detected
87 intoken = true;
88 tokpos = 0;
89 token[0] = 0;
90 token[1] = 0;
91 token[2] = 0;
92 ampersand = false;
93 textEnd = from-1;
94 wordEnd = text.c_str() + text.length();//not good, instead of wordEnd = to!
95
96 // wordEnd = to;
97 continue;
98 }
99
100 if (*from == '&') {
101 intoken = true;
102 tokpos = 0;
103 token[0] = 0;
104 token[1] = 0;
105 token[2] = 0;
106 ampersand = true;
107 continue;
108 }
109
110 if (*from == ';' && ampersand) {
111 intoken = false;
112 ampersand = false;
113
114 if (*token == '#') {
115 text += '&';
116 text += token;
117 text += ';';
118 }
119 else if (!strncmp("nbsp", token, 4)) text += ' ';
120 else if (!strncmp("quot", token, 4)) text += '"';
121 else if (!strncmp("amp", token, 3)) text += '&';
122 else if (!strncmp("lt", token, 2)) text += '<';
123 else if (!strncmp("gt", token, 2)) text += '>';
124 else if (!strncmp("brvbar", token, 6)) text += '�';
125 else if (!strncmp("sect", token, 4)) text += '�';
126 else if (!strncmp("copy", token, 4)) text += '�';
127 else if (!strncmp("laquo", token, 5)) text += '�';
128 else if (!strncmp("reg", token, 3)) text += '�';
129 else if (!strncmp("acute", token, 5)) text += '�';
130 else if (!strncmp("para", token, 4)) text += '�';
131 else if (!strncmp("raquo", token, 5)) text += '�';
132 else if (!strncmp("Aacute", token, 6)) text += '�';
133 else if (!strncmp("Agrave", token, 6)) text += '�';
134 else if (!strncmp("Acirc", token, 5)) text += '�';
135 else if (!strncmp("Auml", token, 4)) text += '�';
136 else if (!strncmp("Atilde", token, 6)) text += '�';
137 else if (!strncmp("Aring", token, 5)) text += '�';
138 else if (!strncmp("aacute", token, 6)) text += '�';
139 else if (!strncmp("agrave", token, 6)) text += '�';
140 else if (!strncmp("acirc", token, 5)) text += '�';
141 else if (!strncmp("auml", token, 4)) text += '�';
142 else if (!strncmp("atilde", token, 6)) text += '�';
143 else if (!strncmp("aring", token, 5)) text += '�';
144 else if (!strncmp("Eacute", token, 6)) text += '�';
145 else if (!strncmp("Egrave", token, 6)) text += '�';
146 else if (!strncmp("Ecirc", token, 5)) text += '�';
147 else if (!strncmp("Euml", token, 4)) text += '�';
148 else if (!strncmp("eacute", token, 6)) text += '�';
149 else if (!strncmp("egrave", token, 6)) text += '�';
150 else if (!strncmp("ecirc", token, 5)) text += '�';
151 else if (!strncmp("euml", token, 4)) text += '�';
152 else if (!strncmp("Iacute", token, 6)) text += '�';
153 else if (!strncmp("Igrave", token, 6)) text += '�';
154 else if (!strncmp("Icirc", token, 5)) text += '�';
155 else if (!strncmp("Iuml", token, 4)) text += '�';
156 else if (!strncmp("iacute", token, 6)) text += '�';
157 else if (!strncmp("igrave", token, 6)) text += '�';
158 else if (!strncmp("icirc", token, 5)) text += '�';
159 else if (!strncmp("iuml", token, 4)) text += '�';
160 else if (!strncmp("Oacute", token, 6)) text += '�';
161 else if (!strncmp("Ograve", token, 6)) text += '�';
162 else if (!strncmp("Ocirc", token, 5)) text += '�';
163 else if (!strncmp("Ouml", token, 4)) text += '�';
164 else if (!strncmp("Otilde", token, 6)) text += '�';
165 else if (!strncmp("oacute", token, 6)) text += '�';
166 else if (!strncmp("ograve", token, 6)) text += '�';
167 else if (!strncmp("ocirc", token, 5)) text += '�';
168 else if (!strncmp("ouml", token, 4)) text += '�';
169 else if (!strncmp("otilde", token, 6)) text += '�';
170 else if (!strncmp("Uacute", token, 6)) text += '�';
171 else if (!strncmp("Ugrave", token, 6)) text += '�';
172 else if (!strncmp("Ucirc", token, 5)) text += '�';
173 else if (!strncmp("Uuml", token, 4)) text += '�';
174 else if (!strncmp("uacute", token, 6)) text += '�';
175 else if (!strncmp("ugrave", token, 6)) text += '�';
176 else if (!strncmp("ucirc", token, 5)) text += '�';
177 else if (!strncmp("uuml", token, 4)) text += '�';
178 else if (!strncmp("Yacute", token, 6)) text += '�';
179 else if (!strncmp("yacute", token, 6)) text += '�';
180 else if (!strncmp("yuml", token, 4)) text += '�';
181
182 else if (!strncmp("deg", token, 3)) text += '�';
183 else if (!strncmp("plusmn", token, 6)) text += '�';
184 else if (!strncmp("sup2", token, 4)) text += '�';
185 else if (!strncmp("sup3", token, 4)) text += '�';
186 else if (!strncmp("sup1", token, 4)) text += '�';
187 else if (!strncmp("nbsp", token, 4)) text += '�';
188 else if (!strncmp("pound", token, 5)) text += '�';
189 else if (!strncmp("cent", token, 4)) text += '�';
190 else if (!strncmp("frac14", token, 6)) text += '�';
191 else if (!strncmp("frac12", token, 6)) text += '�';
192 else if (!strncmp("frac34", token, 6)) text += '�';
193 else if (!strncmp("iquest", token, 6)) text += '�';
194 else if (!strncmp("iexcl", token, 5)) text += '�';
195 else if (!strncmp("ETH", token, 3)) text += '�';
196 else if (!strncmp("eth", token, 3)) text += '�';
197 else if (!strncmp("THORN", token, 5)) text += '�';
198 else if (!strncmp("thorn", token, 5)) text += '�';
199 else if (!strncmp("AElig", token, 5)) text += '�';
200 else if (!strncmp("aelig", token, 5)) text += '�';
201 else if (!strncmp("Oslash", token, 6)) text += '�';
202 else if (!strncmp("curren", token, 6)) text += '�';
203 else if (!strncmp("Ccedil", token, 6)) text += '�';
204 else if (!strncmp("ccedil", token, 6)) text += '�';
205 else if (!strncmp("szlig", token, 5)) text += '�';
206 else if (!strncmp("Ntilde", token, 6)) text += '�';
207 else if (!strncmp("ntilde", token, 6)) text += '�';
208 else if (!strncmp("yen", token, 3)) text += '�';
209 else if (!strncmp("not", token, 3)) text += '�';
210 else if (!strncmp("ordf", token, 4)) text += '�';
211 else if (!strncmp("uml", token, 3)) text += '�';
212 else if (!strncmp("shy", token, 3)) text += '�';
213 else if (!strncmp("macr", token, 4)) text += '�';
214 else if (!strncmp("micro", token, 5)) text += "�";
215 else if (!strncmp("middot", token, 6)) text +="�";
216 else if (!strncmp("cedil", token, 5)) text += "�";
217 else if (!strncmp("ordm", token, 4)) text += "�";
218 else if (!strncmp("times", token, 5)) text += "�";
219 else if (!strncmp("divide", token, 6)) text +="�";
220 else if (!strncmp("oslash", token, 6)) text +="�";
221 continue;
222 }
223
224 // handle silly <variant word> items in greek whnu, remove when module is fixed
225 if ((*from == '>') && (*(from-1) < 0)) {
226 text += ">";
227 continue;
228 }
229
230 if (*from == '>') { // process tokens
231 intoken = false;
232 keepToken = false;
233 suspendTextPassThru = false;
234 newWord = true;
235 handled = false;
236
237 while (wordStart < (text.c_str() + text.length())) { //hack
238 if (strchr(";,. :?!()'\"", *wordStart) && wordStart[0] && wordStart[1])
239 wordStart++;
240 else break;
241 }
242 while (wordEnd > wordStart) {
243 if (strchr(" ,;:.?!()'\"", *wordEnd))
244 wordEnd--;
245 else break;
246 }
247
248 // variants
249 if (!strncmp(token, "div type=\"variant\"", 18)) {
250 XMLTag tag = token;
251 text.append("<seg type=\"x-variant\"");
252 SWBuf cls = "x-class:";
253 cls += tag.getAttribute("class");
254 if (cls.length()>8)
255 text.appendFormatted(" subType=\"%s\"", cls.c_str());
256
257 text += ">";
258 divEnd = "</seg>";
259 newText = true;
260 lastspace = false;
261 handled = true;
262 }
263 // section titles
264 if (!strcmp(token, "div class=\"sechead\"")) {
265 // pushString(&to, "<title>");
266 text.append("<title>");
267 divEnd = "</title>";
268 newText = true;
269 lastspace = false;
270 handled = true;
271 }
272 else if (!strcmp(token, "/div")) {
273 //pushString(&to, divEnd.c_str());
274 text.append(divEnd);
275 lastspace = false;
276 handled = true;
277 }
278 // Scripture Reference
279 if (!strncmp(token, "scripRef", 8)) {
280 // pushString(buf, "<reference osisRef=\"");
281 suspendTextPassThru = true;
282 newText = true;
283 handled = true;
284 }
285 else if (!strncmp(token, "/scripRef", 9)) {
286 SWBuf tmp;
287 tmp = "";
288 tmp.append(textStart, (int)(textEnd - textStart)+1);
289 //pushString(&to, convertToOSIS(tmp.c_str(), key));
290 text.append(VerseKey::convertToOSIS(tmp.c_str(), key));
291 suspendTextPassThru = false;
292 handled = true;
293 }
294 // Usage of italics to represent transChange isn't domaninant;
295 // solution: mark in OSIS instead, assume no semantics other than emphasis
296 // of italicized text
297 // if (!strcmp(module->Type(), "Biblical Texts")) {
298 // // Italics assume transchange for Biblical texts
299 // if (!stricmp(token, "i")) {
300 // pushString(&to, "<transChange type=\"added\">");
301 // newText = true;
302 // lastspace = false;
303 // handled = true;
304 // }
305 // else if (!stricmp(token, "/i")) {
306 // pushString(&to, "</transChange>");
307 // lastspace = false;
308 // handled = true;
309 // }
310 // }
311 // else {
312 // // otherwise, italics are just italics
313 //-- end italics for transchange
314 if (!stricmp(token, "i")) {
315 // pushString(&to, "<hi type=\"i\">");
316 text.append("<hi type=\"i\">");
317 newText = true;
318 lastspace = false;
319 handled = true;
320 }
321 else if (!stricmp(token, "/i")) {
322 // pushString(&to, "</hi>");
323 text.append("</hi>");
324 lastspace = false;
325 handled = true;
326 }
327 // }
328
329 if (!strcmp(token, "b")) {
330 // pushString(&to, "<hi type=\"b\">");
331 text.append("<hi type=\"b\">");
332 newText = true;
333 lastspace = false;
334 handled = true;
335 }
336 else if (!strcmp(token, "/b")) {
337 // pushString(&to, "</hi>");
338 text.append("</hi>");
339 lastspace = false;
340 handled = true;
341 }
342
343 // Footnote
344 if (!strncmp(token, "note", 4)) {
345 //pushString(&to, "<note>");
346 text.append("<note>");
347 newText = true;
348 lastspace = false;
349 handled = true;
350 }
351 else if (!strcmp(token, "/note")) {
352 // pushString(&to, "</note>");
353 text.append("</note>");
354 lastspace = false;
355 handled = true;
356 }
357
358 // Figure
359 else if (!strncmp(token, "img ", 4)) {
360 const char *src = strstr(token, "src");
361 if (!src) // assert we have a src attribute
362 continue;
363 // return false;
364
365 //pushString(&to, "<figure src=\"");
366 text.append("<figure src=\"");
367
368 const char* end = strchr(src+2, '"'); //start search behind src="
369
370 if (end) { //append the path
371 text.append(src+2, end - (src+2));
372 }
373
374 // const char *c;
375 // for (c = src;((*c) && (*c != '"')); c++);
376
377 // uncomment for SWORD absolute path logic
378 // if (*(c+1) == '/') {
379 // pushString(buf, "file:");
380 // pushString(buf, module->getConfigEntry("AbsoluteDataPath"));
381 // if (*((*buf)-1) == '/')
382 // c++; // skip '/'
383 // }
384 // end of uncomment for asolute path logic
385
386 // for (c++;((*c) && (*c != '"')); c++)
387 // *to++ = *c;
388
389 //pushString(&to, "\" />");
390 text.append("\" />");
391 handled = true;
392 }
393
394 // Strongs numbers
395 else if (!strnicmp(token, "sync type=\"Strongs\" ", 20)) { // Strongs
396 valto = val;
397 for (unsigned int i = 27; token[i] != '\"' && i < 150; i++)
398 *valto++ = token[i];
399 *valto = 0;
400 if (atoi((!isdigit(*val))?val+1:val) < 5627) {
401 // normal strongs number
402 strstrip(val);
403
404 if (!strncmp(wordStart, "<w ", 3)) {
405 const char *attStart = strstr(wordStart, "lemma");
406 if (attStart) { //existing morph attribute, append this one to it
407 attStart += 7;
408 buf = "";
409 buf.appendFormatted("strong:%s ", val);
410 }
411 else { // no lemma attribute
412 attStart = wordStart + 3;
413 buf = "";
414 buf.appendFormatted(buf, "lemma=\"strong:%s\" ", val);
415 }
416
417 text.insert(attStart - text.c_str(), buf);
418 }
419 else { //wordStart doesn't point to an existing <w> attribute!
420 buf = "";
421 buf.appendFormatted("<w lemma=\"strong:%s\">", val);
422 text.insert(wordStart - text.c_str(), buf);
423 text += "</w>";
424 lastspace = false;
425 }
426 }
427 // OLB verb morph, leave it out of OSIS tag
428 else {
429 }
430 handled = true;
431 }
432
433 // Morphology
434 else if (!strncmp(token, "sync type=\"morph\"", 17)) {
435 SWBuf cls = "";
436 SWBuf morph = "";
437 for (ch = token+17; *ch; ch++) {
438 if (!strncmp(ch, "class=\"", 7)) {
439 valto = val;
440 for (unsigned int i = 7; ch[i] != '\"' && i < 127; i++)
441 *valto++ = ch[i];
442 *valto = 0;
443 strstrip(val);
444 cls = val;
445 }
446 if (!strncmp(ch, "value=\"", 7)) {
447 valto = val;
448 for (unsigned int i = 7; ch[i] != '\"' && i < 127; i++)
449 *valto++ = ch[i];
450 *valto = 0;
451 strstrip(val);
452 morph = val;
453 }
454 }
455 if (!strncmp(wordStart, "<w ", 3)) {
456 const char *attStart = strstr(wordStart, "morph");
457 if (attStart) { //existing morph attribute, append this one to it
458 attStart += 7;
459 buf = "";
460 buf.appendFormatted("%s:%s ", ((cls.length())?cls.c_str():"robinson"), morph.c_str());
461 }
462 else { // no lemma attribute
463 attStart = wordStart + 3;
464 buf = "";
465 buf.appendFormatted("morph=\"%s:%s\" ", ((cls.length())?cls.c_str():"robinson"), morph.c_str());
466 }
467
468 text.insert(attStart - text.c_str(), buf); //hack, we have to
469 }
470 else { //no existing <w> attribute fond
471 buf = "";
472 buf.appendFormatted("<w morph=\"%s:%s\">", ((cls.length())?cls.c_str():"robinson"), morph.c_str());
473 text.insert(wordStart - text.c_str(), buf);
474 text += "</w>";
475 lastspace = false;
476
477 }
478 handled = true;
479 }
480
481 if (!keepToken) {
482 if (!handled) {
483 SWLog::getSystemLog()->logError("Unprocessed Token: <%s> in key %s", token, key ? (const char*)*key : "<unknown>");
484 // exit(-1);
485 }
486 if (from[1] && strchr(" ,;.:?!()'\"", from[1])) {
487 if (lastspace) {
488 text--;
489 }
490 }
491 if (newText) {
492 textStart = from+1;
493 newText = false;
494 }
495 continue;
496 }
497
498 // if not a strongs token, keep token in text
499 text.appendFormatted("<%s>", token);
500
501 if (newText) {
502 textStart = text.c_str() + text.length();
503 newWord = false;
504 }
505 continue;
506 }
507 if (intoken) {
508 if ((tokpos < 2045) && ((*from != 10)&&(*from != 13))) {
509 token[tokpos++] = *from;
510 token[tokpos+2] = 0;
511 }
512 }
513 else {
514 switch (*from) {
515 case '\'':
516 case '\"':
517 case '`':
518 // quoteStack.handleQuote(fromStart, from, &to);
519 text += *from;
520 //from++; //this line removes chars after an apostrophe! Needs fixing.
521 break;
522 default:
523 if (newWord && (*from != ' ')) {
524 wordStart = text.c_str() + text.length();
525 newWord = false;
526
527 //fix this if required?
528 //memset(to, 0, 10);
529
530 }
531
532 if (!suspendTextPassThru) {
533 text += (*from);
534 lastspace = (*from == ' ');
535 }
536 }
537 }
538 }
539
540 VerseKey *vkey = SWDYNAMIC_CAST(VerseKey, key);
541 if (vkey) {
542 SWBuf ref = "";
543 if (vkey->getVerse()) {
544 ref.appendFormatted("\t\t<verse osisID=\"%s\">", vkey->getOSISRef());
545 }
546
547 if (ref.length() > 0) {
548
549 text = ref + text;
550
551 if (vkey->getVerse()) {
552 VerseKey *tmp = (VerseKey *)vkey->clone();
553 *tmp = *vkey;
554 tmp->setAutoNormalize(false);
555 tmp->setIntros(true);
556
557 text += "</verse>";
558
559 *tmp = MAXVERSE;
560 if (*vkey == *tmp) {
561 tmp->setVerse(0);
562 // sprintf(ref, "\t</div>");
563 // pushString(&to, ref);
564 *tmp = MAXCHAPTER;
565 *tmp = MAXVERSE;
566 if (*vkey == *tmp) {
567 tmp->setChapter(0);
568 tmp->setVerse(0);
569 // sprintf(ref, "\t</div>");
570 // pushString(&to, ref);
571 /*
572 if (!quoteStack.empty()) {
573 SWLog::getSystemLog()->logError("popping unclosed quote at end of book");
574 quoteStack.clear();
575 }
576 */
577 }
578 }
579 delete tmp;
580 }
581 // else if (vkey->getChapter()) {
582 // sprintf(ref, "\t<div type=\"chapter\" osisID=\"%s\">", vkey->getOSISRef());
583 // }
584 // else sprintf(ref, "\t<div type=\"book\" osisID=\"%s\">", vkey->getOSISRef());
585 }
586 }
587 return 0;
588 }
589
590
591 SWORD_NAMESPACE_END
592