## another case of deja-vu
## this time, we want the slashdot style (what Yahoo said to do): only allow
## certain tags... we'll make it an option
## we'll have to tie this in some way to our HTML body displayer...
##
## Ok, there are basically four types of tags:
## 1) safe - ie, <b>, <i>, etc.
## 2) render problems - <table><form><body><frame> - these we either strip,
##    or we have to ensure they match
## 3) definitely evil independent tags that we always strip
## 4) definitely evil tags which denote a region, we strip the entire region

from PassSGMLParser import PassSGMLParser
from urllib import basejoin
import string, sys
import neo_cgi

try:
    from cStringIO import StringIO
except ImportError:
    # Only a missing C accelerator should trigger the pure-Python fallback.
    from StringIO import StringIO


class SafeHtml (PassSGMLParser):
    """Pass-through SGML parser that only lets whitelisted HTML reach the output.

    Tags fall into four classes (all keyed by upper-case tag name):

    _safeTags   emitted as-is (after attribute cleanup).
    _matchTags  emitted, but open/close counts are tracked so that close()
                can append any end tags the input left out.
    _skipTags   the tag itself is dropped, its content is kept.
    _stripTags  the tag and everything up to its end tag is dropped.

    Tags in none of the tables are dropped when extra_safe is true and
    passed through (with attribute cleanup) otherwise.
    """

    _safeTags = {"P":1, "LI":1, "DD":1, "DT":1, "EM":1, "BR":1, "CITE":1,
                 "DFN":1, "Q":1, "STRONG":1, "IMG":1, "HR":1,
                 "TR":1, "TD":1, "TH":1, "CAPTION":1, "THEAD":1, "TFOOT":1,
                 "TBODY":1}
    _matchTags = {"TABLE":1, "OL":1, "UL":1, "DL":1, "CENTER":1, "DIV":1, "PRE":1,
                  "SUB":1, "SUP":1, "BIG":1, "SMALL":1, "CODE":1,
                  "B":1, "I":1, "A":1, "TT":1, "BLOCKQUOTE":1, "U":1,
                  "H1":1, "H2":1, "H3":1, "H4":1, "H5":1, "H6":1, "FONT":1}
    _skipTags = {"FORM":1, "HTML":1, "BODY":1, "EMBED":1, "AREA":1, "MAP":1,
                 "FRAME":1, "FRAMESET":1, "IFRAME":1, "META":1}
    _stripTags = {"HEAD":1, "JAVA":1, "APPLET":1, "OBJECT":1,
                  "JAVASCRIPT":1, "LAYER":1, "STYLE":1, "SCRIPT":1}

    # Attributes whose value is a URL and therefore gets remapped/rewritten.
    _urlAttrs = ("action", "href", "src", "lowsrc", "background")

    def __init__ (self, fp, extra_safe=1, base=None, map_urls=None, new_window=1):
        """Create a sanitizing parser writing to *fp*.

        fp          output file-like object, handed to PassSGMLParser.
        extra_safe  when true, tags not listed in any table are dropped.
        base        optional base URL that URL attributes are joined against.
        map_urls    optional mapping from content-id to replacement URL,
                    applied to URL attributes of the form "cid:<id>".
        new_window  when true, every <a> gets target="_blank" and any
                    target supplied by the input is discarded.
        """
        self._extra_safe = extra_safe
        PassSGMLParser.__init__ (self, fp, extra_safe)
        self._matchDict = {}    # upper-case tag -> number of currently open tags
        self._stripping = 0     # nesting depth of strip regions; 0 == emitting
        self._base = base
        self._map_urls = map_urls
        self._new_window = new_window

    def safe_start_strip (self):
        """Enter a stripped region (regions nest via a depth counter)."""
        if self._stripping == 0:
            # Flush while write() still passes data through, before
            # suppression starts.
            self.flush()
        self._stripping = self._stripping + 1

    def safe_end_strip (self):
        """Leave a stripped region; the depth never goes below zero."""
        self.flush()
        self._stripping = self._stripping - 1
        if self._stripping < 0:
            # An end tag without a matching start tag must not unbalance us.
            self._stripping = 0

    def write (self, data):
        """Forward *data* to the underlying writer unless inside a stripped region."""
        if self._stripping == 0:
            PassSGMLParser.write(self, data)

    def _is_script_url (self, value):
        """Return true if *value* would be treated as a javascript: URL.

        Tabs/CR/LF anywhere and leading whitespace or control characters are
        removed before the scheme is compared, since a plain prefix test is
        trivially bypassed by values such as " javascript:..." or
        "java\\tscript:...".
        """
        compact = value
        for ch in "\t\r\n":
            compact = compact.replace(ch, "")
        start = 0
        while start < len(compact) and compact[start] <= " ":
            start = start + 1
        return compact[start:start + 11].lower() == "javascript:"

    def cleanup_attrs (self, tag, attrs):
        """Return a sanitized copy of *attrs* (a list of (name, value) pairs).

        - event handler attributes (names starting with "on") are dropped;
        - attributes whose value is a javascript: URL are dropped;
        - "cid:" URLs are replaced via map_urls when a mapping exists
          (unknown ids are left untouched);
        - every other URL attribute is joined against base (if given) and
          wrapped in a Google redirect URL;
        - with new_window set, <a> tags get target="_blank" and lose any
          target of their own.
        """
        new_attrs = []
        tag = tag.lower()
        force_target = self._new_window and tag == "a"
        if force_target:
            new_attrs.append(('target', '_blank'))
        for name, value in attrs:
            name = name.lower()
            if name[:2] == "on": continue   ## skip any javascript events
            if self._is_script_url(value): continue
            if name in SafeHtml._urlAttrs:
                if self._map_urls and value[:4] == 'cid:':
                    try:
                        value = self._map_urls[value[4:]]
                    except KeyError:
                        pass
                else:
                    if self._base:
                        value = basejoin (self._base, value)
                    value = 'http://www.google.com/url?sa=D&q=%s' % (neo_cgi.urlEscape(value))
            if force_target and name == "target": continue
            new_attrs.append ((name, value))
        return new_attrs

    def unknown_starttag(self, tag, attrs):
        """Handle a start tag according to the table its name appears in."""
        tag = tag.upper()
        if tag in SafeHtml._stripTags:
            self.safe_start_strip()
            return
        if self._stripping > 0:
            # Inside a stripped region nothing is emitted, so nothing may be
            # counted either; otherwise close() would append end tags for
            # start tags that never reached the output.
            return
        if tag in SafeHtml._skipTags:
            return
        if tag in SafeHtml._matchTags:
            self._matchDict[tag] = self._matchDict.get(tag, 0) + 1
        elif tag not in SafeHtml._safeTags and self._extra_safe:
            return
        self.write_starttag (tag, self.cleanup_attrs(tag, attrs))

    def unknown_endtag(self, tag):
        """Handle an end tag according to the table its name appears in."""
        tag = tag.upper()
        if tag in SafeHtml._stripTags:
            self.safe_end_strip()
            return
        if self._stripping > 0:
            return
        if tag in SafeHtml._skipTags:
            return
        if tag in SafeHtml._matchTags:
            open_count = self._matchDict.get(tag, 0)
            if open_count <= 0:
                # Unmatched end tag: emitting it could close an element of
                # the page the sanitized fragment is embedded in.
                return
            self._matchDict[tag] = open_count - 1
            self.write_endtag (tag)
        elif tag in SafeHtml._safeTags or not self._extra_safe:
            self.write_endtag (tag)

    def close (self):
        """Append end tags for still-open match tags, then close the parser.

        Only per-tag counts are tracked, so the relative order of the
        appended end tags is unspecified.
        """
        # An unterminated strip region must not swallow the closing tags.
        self._stripping = 0
        for tag in self._matchDict.keys():
            for x in range (self._matchDict[tag]):
                self.write_endtag(tag)
        PassSGMLParser.close(self)


def SafeHtmlString (s, really_safe=1, map_urls=None):
    """Sanitize the HTML string *s* and return the result as a string.

    really_safe is passed to SafeHtml as extra_safe; map_urls is passed
    through unchanged.
    """
    fp = StringIO()
    parser = SafeHtml(fp, really_safe, map_urls=map_urls)
    parser.feed (s)
    parser.close ()
    return fp.getvalue()