1 /******************************************************************************
2  *
3  *  osisplain.cpp -	An SWFilter that provides stripping of OSIS tags
4  *
5  * $Id: osisplain.cpp 3548 2017-12-10 05:11:38Z scribe $
6  *
7  * Copyright 2003-2013 CrossWire Bible Society (http://www.crosswire.org)
8  *	CrossWire Bible Society
9  *	P. O. Box 2528
10  *	Tempe, AZ  85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #include <stdlib.h>
24 #include <osisplain.h>
25 #include <ctype.h>
26 #include <versekey.h>
27 #include <stringmgr.h>
28 #include <utilxml.h>
29 #include <swmodule.h>
30 
31 SWORD_NAMESPACE_START
32 
33 
34 namespace {
35 
36 	class MyUserData : public BasicFilterUserData {
37 	public:
38 		SWBuf w;
39 		XMLTag tag;
40 		char testament;
41 		SWBuf hiType;
MyUserData(const SWModule * module,const SWKey * key)42 		MyUserData(const SWModule *module, const SWKey *key) : BasicFilterUserData(module, key) {}
43 	};
44 }
45 
46 
OSISPlain()47 OSISPlain::OSISPlain() {
48 	setTokenStart("<");
49 	setTokenEnd(">");
50 
51 	setEscapeStart("&");
52 	setEscapeEnd(";");
53 
54 	setEscapeStringCaseSensitive(true);
55 
56 	addEscapeStringSubstitute("amp", "&");
57 	addEscapeStringSubstitute("apos", "'");
58 	addEscapeStringSubstitute("lt", "<");
59 	addEscapeStringSubstitute("gt", ">");
60 	addEscapeStringSubstitute("quot", "\"");
61 
62 	setTokenCaseSensitive(true);
63 	addTokenSubstitute("title", "\n");
64 	addTokenSubstitute("/title", "\n");
65 	addTokenSubstitute("/l", "\n");
66 	addTokenSubstitute("lg", "\n");
67 	addTokenSubstitute("/lg", "\n");
68 
69 	setStageProcessing(PRECHAR);
70 }
71 
72 
createUserData(const SWModule * module,const SWKey * key)73 BasicFilterUserData *OSISPlain::createUserData(const SWModule *module, const SWKey *key) {
74 	MyUserData *u = new MyUserData(module, key);
75 	u->testament = (u->vkey) ? u->vkey->getTestament() : 2;	// default to NT
76 	return u;
77 }
78 
79 
processStage(char stage,SWBuf & text,char * & from,BasicFilterUserData * userData)80 bool OSISPlain::processStage(char stage, SWBuf &text, char *&from, BasicFilterUserData *userData) {
81 	// this is a strip filter so we want to do this as optimized as possible.  Avoid calling
82 	// getUniCharFromUTF8 for slight speed improvement
83 
84 	if (stage == PRECHAR) {
85 		if (from[0] == 0xC2 && from[1] == 0xAD) return true;	// skip soft hyphens
86 	}
87 	return false;
88 }
89 
90 
handleToken(SWBuf & buf,const char * token,BasicFilterUserData * userData)91 bool OSISPlain::handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData) {
92 	   // manually process if it wasn't a simple substitution
93 	if (!substituteToken(buf, token)) {
94 		MyUserData *u = (MyUserData *)userData;
95 		if (((*token == 'w') && (token[1] == ' ')) ||
96 		    ((*token == '/') && (token[1] == 'w') && (!token[2]))) {
97 				 u->tag = token;
98 
99 			bool start = false;
100 			if (*token == 'w') {
101 				if (token[strlen(token)-1] != '/') {
102 					u->w = token;
103 					return true;
104 				}
105 				start = true;
106 			}
107 			u->tag = (start) ? token : u->w.c_str();
108 			bool show = true;	// to handle unplaced article in kjv2003-- temporary till combined
109 
110 			SWBuf lastText = (start) ? "stuff" : u->lastTextNode.c_str();
111 
112 			const char *attrib;
113 			const char *val;
114 			if ((attrib = u->tag.getAttribute("xlit"))) {
115 				val = strchr(attrib, ':');
116 				val = (val) ? (val + 1) : attrib;
117 				buf.append(" <");
118 				buf.append(val);
119 				buf.append('>');
120 			}
121 			if ((attrib = u->tag.getAttribute("gloss"))) {
122 				buf.append(" <");
123 				buf.append(attrib);
124 				buf.append('>');
125 			}
126 			if ((attrib = u->tag.getAttribute("lemma"))) {
127 				int count = u->tag.getAttributePartCount("lemma", ' ');
128 				int i = (count > 1) ? 0 : -1;		// -1 for whole value cuz it's faster, but does the same thing as 0
129 				do {
130 					char gh;
131 					attrib = u->tag.getAttribute("lemma", i, ' ');
132 					if (i < 0) i = 0;	// to handle our -1 condition
133 					val = strchr(attrib, ':');
134 					val = (val) ? (val + 1) : attrib;
135 					if ((strchr("GH", *val)) && (isdigit(val[1]))) {
136 						gh = *val;
137 						val++;
138 					}
139 					else {
140 						gh = (u->testament>1) ? 'G' : 'H';
141 					}
142 					if ((!strcmp(val, "3588")) && (lastText.length() < 1))
143 						show = false;
144 					else	{
145 						buf.append(" <");
146 						buf.append(gh);
147 						buf.append(val);
148 						buf.append(">");
149 					}
150 				} while (++i < count);
151 			}
152 			if ((attrib = u->tag.getAttribute("morph")) && (show)) {
153 				int count = u->tag.getAttributePartCount("morph", ' ');
154 				int i = (count > 1) ? 0 : -1;		// -1 for whole value cuz it's faster, but does the same thing as 0
155 				do {
156 					attrib = u->tag.getAttribute("morph", i, ' ');
157 					if (i < 0) i = 0;	// to handle our -1 condition
158 					val = strchr(attrib, ':');
159 					val = (val) ? (val + 1) : attrib;
160 					if ((*val == 'T') && (strchr("GH", val[1])) && (isdigit(val[2])))
161 						val+=2;
162 					buf.append(" (");
163 					buf.append(val);
164 					buf.append(')');
165 				} while (++i < count);
166 			}
167 			if ((attrib = u->tag.getAttribute("POS"))) {
168 				val = strchr(attrib, ':');
169 				val = (val) ? (val + 1) : attrib;
170 
171 				buf.append(" <");
172 				buf.append(val);
173 				buf.append('>');
174 			}
175 		}
176 
177 		// <note> tag
178 		else if (!strncmp(token, "note", 4)) {
179 				if (!strstr(token, "strongsMarkup")) {	// leave strong's markup notes out, in the future we'll probably have different option filters to turn different note types on or off
180 					buf.append(" [");
181 				}
182 				else	u->suspendTextPassThru = true;
183 				if (u->module) {
184 					XMLTag tag = token;
185 					SWBuf swordFootnote = tag.getAttribute("swordFootnote");
186 					SWBuf footnoteBody = u->module->getEntryAttributes()["Footnote"][swordFootnote]["body"];
187 					buf.append(u->module->renderText(footnoteBody));
188 				}
189 			}
190 		else if (!strncmp(token, "/note", 5)) {
191 			if (!u->suspendTextPassThru)
192 				buf.append("] ");
193 			else	u->suspendTextPassThru = false;
194 		}
195 
196 		// <p> paragraph tag
197 		else if (((*token == 'p') && ((token[1] == ' ') || (!token[1]))) ||
198 			((*token == '/') && (token[1] == 'p') && (!token[2]))) {
199 				userData->supressAdjacentWhitespace = true;
200 				buf.append('\n');
201 		}
202 
203 		// Milestoned paragraph, created by osis2mod
204 		// <div type="paragraph"  sID... />
205 		// <div type="paragraph"  eID... />
206 		else if (!strcmp(u->tag.getName(), "div") && u->tag.getAttribute("type") && (!strcmp(u->tag.getAttribute("type"), "x-p") || !strcmp(u->tag.getAttribute("type"), "paragraph")) &&
207 			(u->tag.isEmpty() && (u->tag.getAttribute("sID") || u->tag.getAttribute("eID")))) {
208 				userData->supressAdjacentWhitespace = true;
209 				buf.append('\n');
210 		}
211 
212                 // <lb .../>
213                 else if (!strncmp(token, "lb", 2)) {
214 			userData->supressAdjacentWhitespace = true;
215 			buf.append('\n');
216 		}
217 		else if (!strncmp(token, "l", 1) && strstr(token, "eID")) {
218 			userData->supressAdjacentWhitespace = true;
219 			buf.append('\n');
220 		}
221 		else if (!strncmp(token, "/divineName", 11)) {
222 			// Get the end portion of the string, and upper case it
223 			char* end = buf.getRawData();
224 			end += buf.size() - u->lastTextNode.size();
225 			toupperstr(end);
226 		}
227 		else if (!strncmp(token, "hi", 2)) {
228 
229 				// handle both OSIS 'type' and TEI 'rend' attributes
230 				// there is no officially supported OSIS overline attribute,
231 				// thus either TEI overline or OSIS x-overline would be best,
232 				// but we have used "ol" in the past, as well.  Once a valid
233 				// OSIS overline attribute is made available, these should all
234 				// eventually be deprecated and never documented that they are supported.
235 				if (strstr(token, "rend=\"ol\"") || strstr(token, "rend=\"x-overline\"") || strstr(token, "rend=\"overline\"")
236 				   || strstr(token, "type=\"ol\"") || strstr(token, "type=\"x-overline\"") || strstr(token, "type=\"overline\"")) {
237 					u->hiType = "overline";
238 				}
239 				else u->hiType = "";
240 				u->suspendTextPassThru = true;
241 			}
242 		else if (!strncmp(token, "/hi", 3)) {
243 			if (u->hiType == "overline") {
244 				const unsigned char *b = (const unsigned char *)u->lastTextNode.c_str();
245 				while (*b) {
246 					const unsigned char *o = b;
247 					if (getUniCharFromUTF8(&b)) {
248 						while (o != b) buf.append(*(o++));
249 						buf.append((unsigned char)0xCC);
250 						buf.append((unsigned char)0x85);
251 					}
252 				}
253 			}
254 			else {
255 				buf.append("* ");
256 				buf.append(u->lastSuspendSegment);
257 				buf.append(" *");
258 			}
259 			u->suspendTextPassThru = false;
260 		}
261 
262 		else if ((!strncmp(token, "q", 1) && (u->tag.getAttribute("marker")))) {
263 			buf.append(u->tag.getAttribute("marker"));
264 			}
265 
266 
267                 // <milestone type="line"/>
268                 else if (!strncmp(token, "milestone", 9)) {
269 			const char* type = strstr(token+10, "type=\"");
270 			if (type && strncmp(type+6, "line", 4)) { //we check for type != line
271 				userData->supressAdjacentWhitespace = true;
272         			buf.append('\n');
273 			}
274 			if (u->tag.getAttribute("marker")) {
275 				buf.append(u->tag.getAttribute("marker"));
276 			}
277                 }
278 
279 		else {
280 			return false;  // we still didn't handle token
281 		}
282 	}
283 	return true;
284 }
285 
286 
287 SWORD_NAMESPACE_END
288