1 /* Copyright (C) 2005 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 #include "autoconfig.h"
18 
19 #include <limits.h>
20 #include <string>
21 #include <utility>
22 #include <list>
23 #include <set>
24 #include <vector>
25 #include <unordered_map>
26 #include <algorithm>
27 #include <regex>
28 
29 using std::vector;
30 using std::list;
31 using std::pair;
32 using std::set;
33 using std::unordered_map;
34 
35 #include "rcldb.h"
36 #include "rclconfig.h"
37 #include "log.h"
38 #include "textsplit.h"
39 #include "utf8iter.h"
40 #include "smallut.h"
41 #include "chrono.h"
42 #include "plaintorich.h"
43 #include "cancelcheck.h"
44 #include "unacpp.h"
45 
46 // Text splitter used to take note of the position of query terms
47 // inside the result text. This is then used to insert highlight tags.
48 class TextSplitPTR : public TextSplit {
49 public:
50 
51     // Out: begin and end byte positions of query terms/groups in text
52     vector<GroupMatchEntry> m_tboffs;
53 
TextSplitPTR(const HighlightData & hdata)54     TextSplitPTR(const HighlightData& hdata)
55         :  m_wcount(0), m_hdata(hdata) {
56         // We separate single terms and groups and extract the group
57         // terms for computing positions list before looking for group
58         // matches. Single terms are stored with a reference to the
59         // entry they come with.
60         for (unsigned int i = 0; i < hdata.index_term_groups.size(); i++) {
61             const HighlightData::TermGroup& tg(hdata.index_term_groups[i]);
62             if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
63                 m_terms[tg.term] = i;
64             } else {
65                 for (const auto& group : tg.orgroups) {
66                     for (const auto& term : group) {
67                         m_gterms.insert(term);
68                     }
69                 }
70             }
71         }
72     }
73 
74     // Accept word and its position. If word is search term, add
75     // highlight zone definition. If word is part of search group
76     // (phrase or near), update positions list.
takeword(const std::string & term,int pos,int bts,int bte)77     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
78         string dumb = term;
79         if (o_index_stripchars) {
80             if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
81                 LOGINFO("PlainToRich::takeword: unac failed for [" << term <<
82                         "]\n");
83                 return true;
84             }
85         }
86 
87         LOGDEB2("Input dumbbed term: '" << dumb << "' " <<  pos << " " << bts
88                 << " " << bte << "\n");
89 
90         // If this word is a search term, remember its byte-offset span.
91         map<string, size_t>::const_iterator it = m_terms.find(dumb);
92         if (it != m_terms.end()) {
93             m_tboffs.push_back(GroupMatchEntry(bts, bte, it->second));
94         }
95 
96         // If word is part of a search group, update its positions list
97         if (m_gterms.find(dumb) != m_gterms.end()) {
98             // Term group (phrase/near) handling
99             m_plists[dumb].push_back(pos);
100             m_gpostobytes[pos] = pair<int,int>(bts, bte);
101             LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
102                     bte << "\n");
103         }
104 
105         // Check for cancellation request
106         if ((m_wcount++ & 0xfff) == 0)
107             CancelCheck::instance().checkCancel();
108 
109         return true;
110     }
111 
112     // Must be called after the split to find the phrase/near match positions
113     virtual bool matchGroups();
114 
115 private:
116     // Word count. Used to call checkCancel from time to time.
117     int m_wcount;
118 
119     // In: user query terms
120     map<string, size_t>    m_terms;
121 
122     // m_gterms holds all the terms in m_groups, as a set for quick lookup
123     set<string>    m_gterms;
124 
125     const HighlightData& m_hdata;
126 
127     // group/near terms word positions.
128     unordered_map<string, vector<int> > m_plists;
129     unordered_map<int, pair<int, int> > m_gpostobytes;
130 };
131 
132 
133 // Look for matches to PHRASE and NEAR term groups and finalize the
134 // matched regions list (sort it by increasing start then decreasing
135 // length)
matchGroups()136 bool TextSplitPTR::matchGroups()
137 {
138     for (unsigned int i = 0; i < m_hdata.index_term_groups.size(); i++) {
139         if (m_hdata.index_term_groups[i].kind !=
140             HighlightData::TermGroup::TGK_TERM) {
141             matchGroup(m_hdata, i, m_plists, m_gpostobytes, m_tboffs);
142         }
143     }
144 
145     // Sort regions by increasing start and decreasing width.
146     // The output process will skip overlapping entries.
147     std::sort(m_tboffs.begin(), m_tboffs.end(),
148               [](const GroupMatchEntry& a, const GroupMatchEntry& b) -> bool {
149                   if (a.offs.first != b.offs.first)
150                       return a.offs.first < b.offs.first;
151                   return a.offs.second > b.offs.second;
152               }
153         );
154     return true;
155 }
156 
#ifndef NO_STD_REGEX
// Replace HTTP(s) urls in text/plain with proper HTML anchors so that
// they become clickable in the preview. We don't make a lot of effort
// for validating, or catching things which are probably urls but miss
// a scheme (e.g. www.xxx.com/index.html), because complicated.
//
// Group 1 is the url. Group 2 is what terminates it: a white space
// character, a '|', or the end of the input (empty match). The
// terminator is part of the matched text, so the replacement must
// copy it back ($2), else the separator between the url and the
// following text would be lost.
static const std::string urlRE =
    "(https?://[[:alnum:]~_/.%?&=,#@]+)([[:space:]|]|$)";
static const std::string urlRep{"<a href=\"$1\">$1</a>$2"};
static std::regex url_re(urlRE);

// Return a copy of the input in which every url recognized by url_re
// is wrapped in an anchor element. The rest of the text is unchanged.
static std::string activate_urls(const std::string& in)
{
    return std::regex_replace(in, url_re, urlRep);
}
#else
// No regular expressions support: return the input unchanged.
static std::string activate_urls(const std::string& in)
{
    return in;
}
#endif
175 
176 // Fix result text for display inside the gui text window.
177 //
178 // We call overridden functions to output header data, beginnings and ends of
179 // matches etc.
180 //
181 // If the input is text, we output the result in chunks, arranging not
182 // to cut in the middle of a tag, which would confuse qtextedit. If
183 // the input is html, the body is always a single output chunk.
plaintorich(const string & in,list<string> & out,const HighlightData & hdata,int chunksize)184 bool PlainToRich::plaintorich(const string& in,
185                               list<string>& out, // Output chunk list
186                               const HighlightData& hdata,
187                               int chunksize)
188 {
189     Chrono chron;
190     bool ret = true;
191     LOGDEB1("plaintorichich: in: [" << in << "]\n");
192 
193     m_hdata = &hdata;
194     // Compute the positions for the query terms.  We use the text
195     // splitter to break the text into words, and compare the words to
196     // the search terms,
197     TextSplitPTR splitter(hdata);
198     // Note: the splitter returns the term locations in byte, not
199     // character, offsets.
200     splitter.text_to_words(in);
201     LOGDEB2("plaintorich: split done " << chron.millis() << " mS\n");
202     // Compute the positions for NEAR and PHRASE groups.
203     splitter.matchGroups();
204     LOGDEB2("plaintorich: group match done " << chron.millis() << " mS\n");
205 
206     out.clear();
207     out.push_back("");
208     list<string>::iterator olit = out.begin();
209 
210     // Rich text output
211     *olit = header();
212 
213     // No term matches. Happens, for example on a snippet selected for
214     // a term match when we are actually looking for a group match
215     // (the snippet generator does this...).
216     if (splitter.m_tboffs.empty()) {
217         LOGDEB1("plaintorich: no term matches\n");
218         ret = false;
219     }
220 
221     // Iterator for the list of input term positions. We use it to
222     // output highlight tags and to compute term positions in the
223     // output text
224     vector<GroupMatchEntry>::iterator tPosIt = splitter.m_tboffs.begin();
225     vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();
226 
227 #if 0
228     for (vector<pair<int, int> >::const_iterator it = splitter.m_tboffs.begin();
229          it != splitter.m_tboffs.end(); it++) {
230         LOGDEB2("plaintorich: region: " << it->first << " "<<it->second<< "\n");
231     }
232 #endif
233 
234     // Input character iterator
235     Utf8Iter chariter(in);
236 
237     // State variables used to limit the number of consecutive empty lines,
238     // convert all eol to '\n', and preserve some indentation
239     int eol = 0;
240     int hadcr = 0;
241     int inindent = 1;
242 
243     // HTML state
244     bool intag = false, inparamvalue = false;
245     // My tag state
246     int inrcltag = 0;
247 
248     string::size_type headend = 0;
249     if (m_inputhtml) {
250         headend = in.find("</head>");
251         if (headend == string::npos)
252             headend = in.find("</HEAD>");
253         if (headend != string::npos)
254             headend += 7;
255     }
256 
257     for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
258         // Check from time to time if we need to stop
259         if ((pos & 0xfff) == 0) {
260             CancelCheck::instance().checkCancel();
261         }
262 
263         // If we still have terms positions, check (byte) position. If
264         // we are at or after a term match, mark.
265         if (tPosIt != tPosEnd) {
266             int ibyteidx = int(chariter.getBpos());
267             if (ibyteidx == tPosIt->offs.first) {
268                 if (!intag && ibyteidx >= (int)headend) {
269                     *olit += startMatch((unsigned int)(tPosIt->grpidx));
270                 }
271                 inrcltag = 1;
272             } else if (ibyteidx == tPosIt->offs.second) {
273                 // Output end of match region tags
274                 if (!intag && ibyteidx > (int)headend) {
275                     *olit += endMatch();
276                 }
277                 // Skip all highlight areas that would overlap this one
278                 int crend = tPosIt->offs.second;
279                 while (tPosIt != splitter.m_tboffs.end() &&
280                        tPosIt->offs.first < crend)
281                     tPosIt++;
282                 inrcltag = 0;
283             }
284         }
285 
286         unsigned int car = *chariter;
287 
288         if (car == '\n') {
289             if (!hadcr)
290                 eol++;
291             hadcr = 0;
292             continue;
293         } else if (car == '\r') {
294             hadcr++;
295             eol++;
296             continue;
297         } else if (eol) {
298             // Got non eol char in line break state. Do line break;
299             inindent = 1;
300             hadcr = 0;
301             if (eol > 2)
302                 eol = 2;
303             while (eol) {
304                 if (!m_inputhtml && m_eolbr)
305                     *olit += "<br>";
306                 *olit += "\n";
307                 eol--;
308             }
309             // Maybe end this chunk, begin next. Don't do it on html
310             // there is just no way to do it right (qtextedit cant grok
311             // chunks cut in the middle of <a></a> for example).
312             if (!m_inputhtml && !inrcltag &&
313                 olit->size() > (unsigned int)chunksize) {
314                 if (m_activatelinks) {
315                     *olit = activate_urls(*olit);
316                 }
317                 out.push_back(string(startChunk()));
318                 olit++;
319             }
320         }
321 
322         switch (car) {
323         case '<':
324             inindent = 0;
325             if (m_inputhtml) {
326                 if (!inparamvalue)
327                     intag = true;
328                 chariter.appendchartostring(*olit);
329             } else {
330                 *olit += "&lt;";
331             }
332             break;
333         case '>':
334             inindent = 0;
335             if (m_inputhtml) {
336                 if (!inparamvalue)
337                     intag = false;
338             }
339             chariter.appendchartostring(*olit);
340             break;
341         case '&':
342             inindent = 0;
343             if (m_inputhtml) {
344                 chariter.appendchartostring(*olit);
345             } else {
346                 *olit += "&amp;";
347             }
348             break;
349         case '"':
350             inindent = 0;
351             if (m_inputhtml && intag) {
352                 inparamvalue = !inparamvalue;
353             }
354             chariter.appendchartostring(*olit);
355             break;
356 
357         case ' ':
358             if (m_eolbr && inindent) {
359                 *olit += "&nbsp;";
360             } else {
361                 chariter.appendchartostring(*olit);
362             }
363             break;
364         case '\t':
365             if (m_eolbr && inindent) {
366                 *olit += "&nbsp;&nbsp;&nbsp;&nbsp;";
367             } else {
368                 chariter.appendchartostring(*olit);
369             }
370             break;
371 
372         default:
373             inindent = 0;
374             chariter.appendchartostring(*olit);
375         }
376 
377     } // End chariter loop
378 
379 #if 0
380     {
381         FILE *fp = fopen("/tmp/debugplaintorich", "a");
382         fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
383         for (list<string>::iterator it = out.begin();
384              it != out.end(); it++) {
385             fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n");
386             fprintf(fp, "%s", it->c_str());
387             fprintf(fp, "ENDOFPLAINTORICHCHUNK\n");
388         }
389         fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
390         fclose(fp);
391     }
392 #endif
393     LOGDEB2("plaintorich: done " << chron.millis() << " mS\n");
394     if (!m_inputhtml && m_activatelinks) {
395         out.back() = activate_urls(out.back());
396     }
397     return ret;
398 }
399