1## another case of deja-vu
## this time, we want the slashdot style (what Yahoo said to do): only allow
## certain tags... we'll make it an option
4## we'll have to tie this in some way to our HTML body displayer...
5##
6## Ok, there are basically four types of tags:
## 1) safe - e.g., <p>, <em>, <br>, etc.
8## 2) render problems - <table><form><body><frame> - these we either strip,
9##    or we have to ensure they match
10## 3) definitely evil independent tags that we always strip
11## 4) definitely evil tags which denote a region, we strip the entire region
12
13from PassSGMLParser import PassSGMLParser
14from urllib import basejoin
15import string, sys
16import neo_cgi
17
18try:
19  from cStringIO import StringIO
20except:
21  from StringIO import StringIO
22
class SafeHtml (PassSGMLParser):
  """Whitelist-style HTML filter layered on PassSGMLParser.

  Every tag name is upper-cased and looked up in four class-level tables:

    _safeTags   written through; no open/close bookkeeping
    _matchTags  written through, and any that are still open when close()
                is called get an end tag appended
    _skipTags   the tag itself is dropped, its content is kept
    _stripTags  the tag and everything up to its end tag is dropped

  Tags in none of the tables are written through only when extra_safe is
  false.

  NOTE(review): PassSGMLParser is not visible from this file.  This class
  relies on it providing write(), flush(), write_starttag(),
  write_endtag() and close(), and on it calling unknown_starttag() /
  unknown_endtag() for each tag (sgmllib-style) -- confirm against
  PassSGMLParser.
  """

  _safeTags = {"P":1, "LI":1, "DD":1, "DT":1, "EM":1, "BR":1, "CITE":1,
               "DFN":1, "Q":1, "STRONG":1, "IMG":1, "HR":1,
               "TR":1, "TD":1, "TH":1, "CAPTION":1, "THEAD":1, "TFOOT":1,
               "TBODY":1}
  _matchTags = {"TABLE":1, "OL":1, "UL":1, "DL":1, "CENTER":1, "DIV":1, "PRE":1,
                "SUB":1, "SUP":1, "BIG":1, "SMALL":1, "CODE":1,
                "B":1, "I":1, "A":1, "TT":1, "BLOCKQUOTE":1, "U":1,
                "H1":1, "H2":1, "H3":1, "H4":1, "H5":1, "H6":1, "FONT":1}
  _skipTags = {"FORM":1, "HTML":1, "BODY":1, "EMBED":1, "AREA":1, "MAP":1,
               "FRAME":1, "FRAMESET":1, "IFRAME":1, "META":1}
  _stripTags = {"HEAD":1, "JAVA":1, "APPLET":1, "OBJECT":1,
                "JAVASCRIPT":1, "LAYER":1, "STYLE":1, "SCRIPT":1}

  # Attributes whose value is treated as a URL by cleanup_attrs().
  _urlAttrs = ("action", "href", "src", "lowsrc", "background")
  # URL schemes that cause an attribute to be dropped entirely.
  _scriptSchemes = ("javascript:", "vbscript:")

  def __init__ (self, fp, extra_safe=1, base=None, map_urls=None, new_window=1):
    """Set up the filter.

    fp         -- handed to PassSGMLParser.__init__ (presumably the output
                  file object -- confirm against PassSGMLParser)
    extra_safe -- when true, tags that appear in none of the tag tables
                  are dropped; also forwarded to PassSGMLParser.__init__
    base       -- if set, URL attributes are joined against it with
                  basejoin()
    map_urls   -- optional mapping used to replace "cid:<key>" URLs with
                  map_urls[<key>]
    new_window -- when true, every A tag gets target="_blank" and any
                  target attribute from the input is discarded
    """
    self._extra_safe = extra_safe
    PassSGMLParser.__init__ (self, fp, extra_safe)
    self._matchDict = {}   # tag -> number of currently open occurrences
    self._openTags = []    # open _matchTags in the order they were opened
    self._stripping = 0    # nesting depth of _stripTags regions
    self._base = base
    self._map_urls = map_urls
    self._new_window = new_window

  def safe_start_strip (self):
    """Enter a stripped region (regions nest via a depth counter)."""
    if self._stripping == 0:
      # flush() is called before the counter goes up, i.e. while write()
      # still passes data through.
      self.flush()
    self._stripping = self._stripping + 1

  def safe_end_strip (self):
    """Leave one level of stripped region; the depth never goes below 0."""
    # flush() is called while the counter is still raised, so whatever it
    # hands to write() at this point is discarded.
    self.flush()
    self._stripping = self._stripping - 1
    if self._stripping < 0: self._stripping = 0

  def write (self, data):
    """Pass data to PassSGMLParser.write unless inside a stripped region."""
    if self._stripping == 0:
      PassSGMLParser.write(self, data)

  def _is_script_url (self, value):
    """Return true if value starts with a script scheme.

    Whitespace and control characters are removed before the comparison
    and the comparison is case-insensitive, so values such as
    " javascript:..." or "java\\tscript:..." are caught as well.
    """
    compact = "".join([ch for ch in value if ch > " "]).lower()
    for scheme in SafeHtml._scriptSchemes:
      if compact[:len(scheme)] == scheme:
        return 1
    return 0

  def cleanup_attrs (self, tag, attrs):
    """Return a sanitized copy of attrs, a list of (name, value) pairs.

    - attributes whose name starts with "on" (event handlers) are dropped
    - attributes whose value is a javascript:/vbscript: URL are dropped
    - with new_window set, A tags get ('target', '_blank') first and any
      target attribute from the input is dropped
    - URL attributes (see _urlAttrs): a "cid:" value is replaced from
      map_urls when map_urls is set (left unchanged if the key is
      missing); every other value is joined against base, if set, and
      then wrapped in the google.com/url redirector

    Attribute names are lower-cased in the result.
    """
    tag = tag.lower()
    force_blank = self._new_window and tag == "a"
    new_attrs = []
    if force_blank:
      new_attrs.append(('target', '_blank'))
    for name, value in attrs:
      name = name.lower()
      if name[:2] == "on": continue   ## skip any javascript events
      if self._is_script_url(value): continue
      if force_blank and name == "target": continue
      if name in SafeHtml._urlAttrs:
        if self._map_urls and value[:4] == 'cid:':
          try:
            value = self._map_urls[value[4:]]
          except KeyError:
            # unknown content id: keep the cid: URL as it is
            pass
        else:
          if self._base:
            value = basejoin (self._base, value)
          value = 'http://www.google.com/url?sa=D&q=%s' % (neo_cgi.urlEscape(value))
      new_attrs.append ((name, value))
    return new_attrs

  def _forget_open_tag (self, tag):
    """Remove the most recently opened occurrence of tag from _openTags."""
    idx = len(self._openTags) - 1
    while idx >= 0:
      if self._openTags[idx] == tag:
        del self._openTags[idx]
        return
      idx = idx - 1

  def unknown_starttag(self, tag, attrs):
    """Handle a start tag according to the table its name is in."""
    tag = tag.upper()
    if tag in SafeHtml._stripTags:
      self.safe_start_strip()
    elif self._stripping:
      # Inside a stripped region nothing is emitted and nothing is
      # counted; unknown_endtag ignores end tags here too, so counting
      # the start tag would leave the tag looking open until close().
      pass
    elif tag in SafeHtml._skipTags:
      pass
    elif tag in SafeHtml._matchTags:
      self._matchDict[tag] = self._matchDict.get(tag, 0) + 1
      self._openTags.append(tag)
      self.write_starttag (tag, self.cleanup_attrs(tag, attrs))
    elif tag in SafeHtml._safeTags or not self._extra_safe:
      self.write_starttag (tag, self.cleanup_attrs(tag, attrs))

  def unknown_endtag(self, tag):
    """Handle an end tag according to the table its name is in."""
    tag = tag.upper()
    if tag in SafeHtml._stripTags:
      self.safe_end_strip()
    elif self._stripping:
      pass
    elif tag in SafeHtml._skipTags:
      pass
    elif tag in SafeHtml._matchTags:
      # Only close what was actually opened.  A stray end tag is dropped:
      # writing it would emit an unmatched closer, and letting the count
      # go negative would hide a later, really unclosed, start tag from
      # close().
      if self._matchDict.get(tag, 0) > 0:
        self._matchDict[tag] = self._matchDict[tag] - 1
        self._forget_open_tag(tag)
        self.write_endtag (tag)
    elif tag in SafeHtml._safeTags or not self._extra_safe:
      self.write_endtag (tag)

  def close (self):
    """Write end tags for all still-open match tags, then close the parser.

    End tags are written innermost-first (reverse order of opening).
    """
    # An unterminated stripped region must not swallow the closing tags.
    self._stripping = 0
    pending = self._openTags
    # Reset the bookkeeping so a second close() does not repeat the tags.
    self._openTags = []
    self._matchDict = {}
    pending.reverse()
    for tag in pending:
      self.write_endtag(tag)
    PassSGMLParser.close(self)
131
def SafeHtmlString (s, really_safe=1, map_urls=None, base=None, new_window=1):
  """Run the string s through SafeHtml and return the filtered result.

  s           -- HTML text to filter
  really_safe -- passed to SafeHtml as extra_safe
  map_urls    -- passed to SafeHtml as map_urls
  base        -- passed to SafeHtml as base
  new_window  -- passed to SafeHtml as new_window

  The parser is given an in-memory StringIO; the return value is that
  buffer's contents after feed() and close().
  """
  fp = StringIO()
  parser = SafeHtml(fp, really_safe, base=base, map_urls=map_urls,
                    new_window=new_window)
  parser.feed (s)
  # close() is what appends end tags for match tags left open in s
  parser.close ()
  return fp.getvalue()
145
146