1 /******************************************************************************
2 *
3 * osisplain.cpp - An SWFilter that provides stripping of OSIS tags
4 *
5 * $Id: osisplain.cpp 3548 2017-12-10 05:11:38Z scribe $
6 *
7 * Copyright 2003-2013 CrossWire Bible Society (http://www.crosswire.org)
8 * CrossWire Bible Society
9 * P. O. Box 2528
10 * Tempe, AZ 85280-2528
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the
14 * Free Software Foundation version 2.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 */
22
23 #include <stdlib.h>
24 #include <osisplain.h>
25 #include <ctype.h>
26 #include <versekey.h>
27 #include <stringmgr.h>
28 #include <utilxml.h>
29 #include <swmodule.h>
30
31 SWORD_NAMESPACE_START
32
33
34 namespace {
35
36 class MyUserData : public BasicFilterUserData {
37 public:
38 SWBuf w;
39 XMLTag tag;
40 char testament;
41 SWBuf hiType;
MyUserData(const SWModule * module,const SWKey * key)42 MyUserData(const SWModule *module, const SWKey *key) : BasicFilterUserData(module, key) {}
43 };
44 }
45
46
OSISPlain()47 OSISPlain::OSISPlain() {
48 setTokenStart("<");
49 setTokenEnd(">");
50
51 setEscapeStart("&");
52 setEscapeEnd(";");
53
54 setEscapeStringCaseSensitive(true);
55
56 addEscapeStringSubstitute("amp", "&");
57 addEscapeStringSubstitute("apos", "'");
58 addEscapeStringSubstitute("lt", "<");
59 addEscapeStringSubstitute("gt", ">");
60 addEscapeStringSubstitute("quot", "\"");
61
62 setTokenCaseSensitive(true);
63 addTokenSubstitute("title", "\n");
64 addTokenSubstitute("/title", "\n");
65 addTokenSubstitute("/l", "\n");
66 addTokenSubstitute("lg", "\n");
67 addTokenSubstitute("/lg", "\n");
68
69 setStageProcessing(PRECHAR);
70 }
71
72
createUserData(const SWModule * module,const SWKey * key)73 BasicFilterUserData *OSISPlain::createUserData(const SWModule *module, const SWKey *key) {
74 MyUserData *u = new MyUserData(module, key);
75 u->testament = (u->vkey) ? u->vkey->getTestament() : 2; // default to NT
76 return u;
77 }
78
79
processStage(char stage,SWBuf & text,char * & from,BasicFilterUserData * userData)80 bool OSISPlain::processStage(char stage, SWBuf &text, char *&from, BasicFilterUserData *userData) {
81 // this is a strip filter so we want to do this as optimized as possible. Avoid calling
82 // getUniCharFromUTF8 for slight speed improvement
83
84 if (stage == PRECHAR) {
85 if (from[0] == 0xC2 && from[1] == 0xAD) return true; // skip soft hyphens
86 }
87 return false;
88 }
89
90
handleToken(SWBuf & buf,const char * token,BasicFilterUserData * userData)91 bool OSISPlain::handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData) {
92 // manually process if it wasn't a simple substitution
93 if (!substituteToken(buf, token)) {
94 MyUserData *u = (MyUserData *)userData;
95 if (((*token == 'w') && (token[1] == ' ')) ||
96 ((*token == '/') && (token[1] == 'w') && (!token[2]))) {
97 u->tag = token;
98
99 bool start = false;
100 if (*token == 'w') {
101 if (token[strlen(token)-1] != '/') {
102 u->w = token;
103 return true;
104 }
105 start = true;
106 }
107 u->tag = (start) ? token : u->w.c_str();
108 bool show = true; // to handle unplaced article in kjv2003-- temporary till combined
109
110 SWBuf lastText = (start) ? "stuff" : u->lastTextNode.c_str();
111
112 const char *attrib;
113 const char *val;
114 if ((attrib = u->tag.getAttribute("xlit"))) {
115 val = strchr(attrib, ':');
116 val = (val) ? (val + 1) : attrib;
117 buf.append(" <");
118 buf.append(val);
119 buf.append('>');
120 }
121 if ((attrib = u->tag.getAttribute("gloss"))) {
122 buf.append(" <");
123 buf.append(attrib);
124 buf.append('>');
125 }
126 if ((attrib = u->tag.getAttribute("lemma"))) {
127 int count = u->tag.getAttributePartCount("lemma", ' ');
128 int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0
129 do {
130 char gh;
131 attrib = u->tag.getAttribute("lemma", i, ' ');
132 if (i < 0) i = 0; // to handle our -1 condition
133 val = strchr(attrib, ':');
134 val = (val) ? (val + 1) : attrib;
135 if ((strchr("GH", *val)) && (isdigit(val[1]))) {
136 gh = *val;
137 val++;
138 }
139 else {
140 gh = (u->testament>1) ? 'G' : 'H';
141 }
142 if ((!strcmp(val, "3588")) && (lastText.length() < 1))
143 show = false;
144 else {
145 buf.append(" <");
146 buf.append(gh);
147 buf.append(val);
148 buf.append(">");
149 }
150 } while (++i < count);
151 }
152 if ((attrib = u->tag.getAttribute("morph")) && (show)) {
153 int count = u->tag.getAttributePartCount("morph", ' ');
154 int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0
155 do {
156 attrib = u->tag.getAttribute("morph", i, ' ');
157 if (i < 0) i = 0; // to handle our -1 condition
158 val = strchr(attrib, ':');
159 val = (val) ? (val + 1) : attrib;
160 if ((*val == 'T') && (strchr("GH", val[1])) && (isdigit(val[2])))
161 val+=2;
162 buf.append(" (");
163 buf.append(val);
164 buf.append(')');
165 } while (++i < count);
166 }
167 if ((attrib = u->tag.getAttribute("POS"))) {
168 val = strchr(attrib, ':');
169 val = (val) ? (val + 1) : attrib;
170
171 buf.append(" <");
172 buf.append(val);
173 buf.append('>');
174 }
175 }
176
177 // <note> tag
178 else if (!strncmp(token, "note", 4)) {
179 if (!strstr(token, "strongsMarkup")) { // leave strong's markup notes out, in the future we'll probably have different option filters to turn different note types on or off
180 buf.append(" [");
181 }
182 else u->suspendTextPassThru = true;
183 if (u->module) {
184 XMLTag tag = token;
185 SWBuf swordFootnote = tag.getAttribute("swordFootnote");
186 SWBuf footnoteBody = u->module->getEntryAttributes()["Footnote"][swordFootnote]["body"];
187 buf.append(u->module->renderText(footnoteBody));
188 }
189 }
190 else if (!strncmp(token, "/note", 5)) {
191 if (!u->suspendTextPassThru)
192 buf.append("] ");
193 else u->suspendTextPassThru = false;
194 }
195
196 // <p> paragraph tag
197 else if (((*token == 'p') && ((token[1] == ' ') || (!token[1]))) ||
198 ((*token == '/') && (token[1] == 'p') && (!token[2]))) {
199 userData->supressAdjacentWhitespace = true;
200 buf.append('\n');
201 }
202
203 // Milestoned paragraph, created by osis2mod
204 // <div type="paragraph" sID... />
205 // <div type="paragraph" eID... />
206 else if (!strcmp(u->tag.getName(), "div") && u->tag.getAttribute("type") && (!strcmp(u->tag.getAttribute("type"), "x-p") || !strcmp(u->tag.getAttribute("type"), "paragraph")) &&
207 (u->tag.isEmpty() && (u->tag.getAttribute("sID") || u->tag.getAttribute("eID")))) {
208 userData->supressAdjacentWhitespace = true;
209 buf.append('\n');
210 }
211
212 // <lb .../>
213 else if (!strncmp(token, "lb", 2)) {
214 userData->supressAdjacentWhitespace = true;
215 buf.append('\n');
216 }
217 else if (!strncmp(token, "l", 1) && strstr(token, "eID")) {
218 userData->supressAdjacentWhitespace = true;
219 buf.append('\n');
220 }
221 else if (!strncmp(token, "/divineName", 11)) {
222 // Get the end portion of the string, and upper case it
223 char* end = buf.getRawData();
224 end += buf.size() - u->lastTextNode.size();
225 toupperstr(end);
226 }
227 else if (!strncmp(token, "hi", 2)) {
228
229 // handle both OSIS 'type' and TEI 'rend' attributes
230 // there is no officially supported OSIS overline attribute,
231 // thus either TEI overline or OSIS x-overline would be best,
232 // but we have used "ol" in the past, as well. Once a valid
233 // OSIS overline attribute is made available, these should all
234 // eventually be deprecated and never documented that they are supported.
235 if (strstr(token, "rend=\"ol\"") || strstr(token, "rend=\"x-overline\"") || strstr(token, "rend=\"overline\"")
236 || strstr(token, "type=\"ol\"") || strstr(token, "type=\"x-overline\"") || strstr(token, "type=\"overline\"")) {
237 u->hiType = "overline";
238 }
239 else u->hiType = "";
240 u->suspendTextPassThru = true;
241 }
242 else if (!strncmp(token, "/hi", 3)) {
243 if (u->hiType == "overline") {
244 const unsigned char *b = (const unsigned char *)u->lastTextNode.c_str();
245 while (*b) {
246 const unsigned char *o = b;
247 if (getUniCharFromUTF8(&b)) {
248 while (o != b) buf.append(*(o++));
249 buf.append((unsigned char)0xCC);
250 buf.append((unsigned char)0x85);
251 }
252 }
253 }
254 else {
255 buf.append("* ");
256 buf.append(u->lastSuspendSegment);
257 buf.append(" *");
258 }
259 u->suspendTextPassThru = false;
260 }
261
262 else if ((!strncmp(token, "q", 1) && (u->tag.getAttribute("marker")))) {
263 buf.append(u->tag.getAttribute("marker"));
264 }
265
266
267 // <milestone type="line"/>
268 else if (!strncmp(token, "milestone", 9)) {
269 const char* type = strstr(token+10, "type=\"");
270 if (type && strncmp(type+6, "line", 4)) { //we check for type != line
271 userData->supressAdjacentWhitespace = true;
272 buf.append('\n');
273 }
274 if (u->tag.getAttribute("marker")) {
275 buf.append(u->tag.getAttribute("marker"));
276 }
277 }
278
279 else {
280 return false; // we still didn't handle token
281 }
282 }
283 return true;
284 }
285
286
287 SWORD_NAMESPACE_END
288