1 // src/markup.cc
2 // This file is part of libpbe; see http://decimail.org
3 // (C) 2004 Philip Endecott
4 
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 #include "markup.hh"
19 
20 #include "StringTransformer.hh"
21 
22 #include <regex.h>
23 #include <assert.h>
24 
25 
26 class AtDotToWords: public StringTransformer {
27 public:
AtDotToWords()28   AtDotToWords() {
29     add_cs_rule('@'," AT ");
30     add_cs_rule('.'," DOT ");
31   }
32 };
33 
34 static AtDotToWords at_dot_to_words;
35 
36 
render_email(string e)37 static string render_email(string e)
38 {
39   string h = "<span class=\"ofsce\">"+at_dot_to_words(e)+"</span>";
40   return h;
41 }
42 
43 
// Render a URI found in running text as an HTML anchor.
//
// u is the text exactly as it was matched; it is used unchanged as the
// visible link text.  The href is u itself when u already begins with an
// explicit "http://" or "https://" scheme, and "http://" + u otherwise.
//
// The scheme test has to look for the whole "http://" / "https://" prefix.
// Looking only at the first four letters would treat a scheme-less host
// whose name merely starts with "http" (e.g. "httpd.apache.org") as already
// absolute and emit a relative href for it.
//
// No HTML escaping is performed here; u is inserted into the attribute and
// the element content as given.
static std::string render_uri(std::string u)
{
  // compare() with position 0 never throws and copes with short strings.
  const bool has_scheme = u.compare(0, 7, "http://") == 0
                       || u.compare(0, 8, "https://") == 0;

  std::string h = "<a href=\"";
  if (!has_scheme) {
    h += "http://";
  }
  h += u;
  h += "\">";
  h += u;
  h += "</a>";
  return h;
}
59 
60 
61 // See RFC2396 for URI syntax.
62 
63 // We make the protocol prefix optional, so www.foo.com works.
64 
65 // We require that the last component of the domain name is purely
66 // alphabetic and has at least two characters.  This is true in
67 // practice (.com, .uk) and helps to avoid false positives ("e.g.").
68 
69 // We require that the last character of a path segment is not a
70 // common punctuation character: .!):;,  This is because
71 // these rarely fall at the end of a true URI but are often placed
72 // after a URI when it is written in text.
73 
74 // We apply this after escaping & to &amp;.  This is only an issue in
75 // the path where ; normally has a special meaning which we have to
76 // ignore.
77 
78 // We reuse the URI domain rules for emails.
79 
80 // For email local parts we use the list of characters allowed by
81 // RFC2822, but allow adjacent .s.
82 
// We disallow @ in URIs so that we can use it to distinguish emails
// from URIs.  Strictly speaking, @ should be allowed in paths, queries
// and fragment-ids.
86 
87 
// POSIX extended regular expression (for regcomp with REG_EXTENDED) that
// recognises a URI in running text.  It is built from adjacent string
// literals, one per URI component.
//
// '%' is accepted in the query and fragment-id as well as in the path, so
// that percent-escaped octets (RFC2396 "escaped") do not cut a match short,
// e.g. "http://example.com/s?q=a%20b".  '@' is left out of every character
// class on purpose.
const char* uri_regexp =
"(https?://)?"  // optional protocol
"[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-zA-Z][a-zA-Z]+"  // hostname
"(:[0-9]+)?"  // optional port number
"(/[-a-zA-Z0-9_.!~*'()%:&;=+$,]*[-a-zA-Z0-9_~*'(%&=+$])*"  // path, may be empty
"/?"  // path to directory may be terminated with a /
"(\\?[-;/?:&=+$,a-zA-Z0-9_.!~*'()%]*)?"  // optional query
"(#[-;/?:&=+$,a-zA-Z0-9_.!~*'()%]*)?"  // optional fragment-id
;
97 
// POSIX extended regular expression (for regcomp with REG_EXTENDED) that
// recognises an email address in running text: a local part, '@', and a
// hostname written the same way as in the URI pattern.
//
// The '-' is placed first in the local-part bracket expression so that it
// is a literal.  Written between '+' and '/' it would form the range
// '+'..'/' and silently admit ',' as well, so that "a,b@example.com" would
// be taken as a single address.
const char* email_regexp =
"[-a-zA-Z0-9!#$%&'*+/=?^_`{|}~.]+"  // local-part
"@"
"[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-zA-Z][a-zA-Z]+"  // hostname
;
103 
104 
markup_uris_emails(string & text)105 void markup_uris_emails ( string& text )
106 {
107   static bool re_compiled = false;
108   static regex_t re;
109   if (!re_compiled) {
110     string regexp = string("(") + uri_regexp + ")|(" + email_regexp +")";
111     int rc = regcomp(&re, regexp.c_str(), REG_EXTENDED);
112     assert(rc==0);
113     re_compiled=true;
114   }
115 
116   string r;
117   regmatch_t match;
118 
119   int pos=0;
120   while (1) {
121     int rc = regexec(&re, text.c_str()+pos, 1, &match, 0);
122     assert((rc==0) || (rc==REG_NOMATCH));
123     if (rc==REG_NOMATCH) {
124       r.append(text.substr(pos));
125       break;
126     }
127     r += text.substr(pos,match.rm_so);
128     string p = text.substr(pos+match.rm_so,match.rm_eo-match.rm_so);
129     if (p.find('@')!=p.npos) {
130       r += render_email(p);
131     } else {
132       r += render_uri(p);
133     }
134     pos += match.rm_eo;
135   }
136 
137   text = r;
138 }
139