1 // src/markup.cc
2 // This file is part of libpbe; see http://decimail.org
3 // (C) 2004 Philip Endecott
4
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 #include "markup.hh"
19
20 #include "StringTransformer.hh"
21
22 #include <regex.h>
23 #include <assert.h>
24
25
26 class AtDotToWords: public StringTransformer {
27 public:
AtDotToWords()28 AtDotToWords() {
29 add_cs_rule('@'," AT ");
30 add_cs_rule('.'," DOT ");
31 }
32 };
33
34 static AtDotToWords at_dot_to_words;
35
36
render_email(string e)37 static string render_email(string e)
38 {
39 string h = "<span class=\"ofsce\">"+at_dot_to_words(e)+"</span>";
40 return h;
41 }
42
43
// Build the HTML anchor for a URI found in running text.
//
// u is used unchanged as the link text.  The href is u itself when u
// already starts with "http://" or "https://"; otherwise "http://" is
// prepended so that a bare host such as www.foo.com yields an absolute
// link.
//
// The scheme test looks for the full "http://" / "https://" prefix rather
// than just the letters "http": a scheme-less host such as httpd.apache.org
// also starts with "http" and must still get the prefix, or the browser
// would treat the href as a relative link.
static std::string render_uri(const std::string& u)
{
  const bool has_scheme = u.compare(0, 7, "http://") == 0
                       || u.compare(0, 8, "https://") == 0;

  std::string h = "<a href=\"";
  if (!has_scheme) {
    h += "http://";
  }
  h += u;
  h += "\">";
  h += u;
  h += "</a>";
  return h;
}
59
60
61 // See RFC2396 for URI syntax.
62
63 // We make the protocol prefix optional, so www.foo.com works.
64
65 // We require that the last component of the domain name is purely
66 // alphabetic and has at least two characters. This is true in
67 // practice (.com, .uk) and helps to avoid false positives ("e.g.").
68
// We require that the last character of a path segment is not one of
// the common punctuation characters  . ! ) : ; ,  because these rarely
// fall at the end of a true URI but are often placed after a URI when
// it is written in text.
73
// We apply this after escaping & to &amp;.  This is only an issue in
// the path, where ; normally has a special meaning which we have to
// ignore.
77
78 // We reuse the URI domain rules for emails.
79
80 // For email local parts we use the list of characters allowed by
81 // RFC2822, but allow adjacent .s.
82
// We disallow @ in URIs so that we can use it to distinguish emails
// from URIs.  Strictly, it should be allowed in paths, queries and
// fragment-ids.
86
87
// POSIX extended regular expression for a URI as it appears in running
// text.  The adjacent literals below concatenate into one pattern; each
// piece is described by the comment above it.
const char* uri_regexp =
  // Optional protocol: http:// or https://
  "(https?://)?"
  // Hostname: one or more dot-separated labels ...
  "[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*"
  // ... followed by a purely alphabetic last label of at least two letters.
  "\\.[a-zA-Z][a-zA-Z]+"
  // Optional port number.
  "(:[0-9]+)?"
  // Path, may be empty.  The final character of each segment is drawn from
  // a smaller set that leaves out . ! ) : ; ,
  "(/[-a-zA-Z0-9_.!~*'()%:&;=+$,]*[-a-zA-Z0-9_~*'(%&=+$])*"
  // A path to a directory may be terminated with a /
  "/?"
  // Optional query.
  "(\\?[-;/?:&=+$,a-zA-Z0-9_.!~*'()]*)?"
  // Optional fragment-id.
  "(#[-;/?:&=+$,a-zA-Z0-9_.!~*'()]*)?"
  ;
97
// POSIX extended regular expression for an email address:
// local-part, "@", hostname.
//
// The local-part accepts letters, digits, '.' and the punctuation
// ! # $ % & ' * + - / = ? ^ _ ` { | } ~   (no comma).
// The '-' is placed first in the bracket expression so that it is taken
// literally.  Written between '+' and '/', it would form the range '+'..'/'
// which also admits ',' and would make "a,b@example.com" match as a single
// address.
//
// The hostname part is the same expression as the hostname in uri_regexp.
const char* email_regexp =
  "[-a-zA-Z0-9!#$%&'*+/=?^_`{|}~.]+"                       // local-part
  "@"
  "[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-zA-Z][a-zA-Z]+"    // hostname
  ;
103
104
markup_uris_emails(string & text)105 void markup_uris_emails ( string& text )
106 {
107 static bool re_compiled = false;
108 static regex_t re;
109 if (!re_compiled) {
110 string regexp = string("(") + uri_regexp + ")|(" + email_regexp +")";
111 int rc = regcomp(&re, regexp.c_str(), REG_EXTENDED);
112 assert(rc==0);
113 re_compiled=true;
114 }
115
116 string r;
117 regmatch_t match;
118
119 int pos=0;
120 while (1) {
121 int rc = regexec(&re, text.c_str()+pos, 1, &match, 0);
122 assert((rc==0) || (rc==REG_NOMATCH));
123 if (rc==REG_NOMATCH) {
124 r.append(text.substr(pos));
125 break;
126 }
127 r += text.substr(pos,match.rm_so);
128 string p = text.substr(pos+match.rm_so,match.rm_eo-match.rm_so);
129 if (p.find('@')!=p.npos) {
130 r += render_email(p);
131 } else {
132 r += render_uri(p);
133 }
134 pos += match.rm_eo;
135 }
136
137 text = r;
138 }
139