##
# This file is part of WhatWeb and may be subject to
# redistribution and commercial restrictions. Please see the WhatWeb
# web site for more information on licensing and terms of use.
# http://www.morningstarsecurity.com/research/whatweb
##

## Version 0.2
# added example sites

Plugin.define "Charset" do
author "Andrew Horton"
version "0.2"
description "Detects the character set of a page, this is required for MongoDB output. Only checks the meta content tag, not the HTTP header. It tries the specified charset, then ascii, then utf-8 then uses CharDet from the rchardet gem which is CPU intensive / slow. Tests conversion to UTF-8 using Iconv. Returns Failed if unsuccessful."
# NOTE: the description above still mentions Iconv; the conversion test below
# uses String#encode, which performs the same "can this be turned into UTF-8"
# check without needing the Iconv library.

# requires rchardet
# https://github.com/jmhodges/rchardet
# http://www.meeho.net/blog/2010/03/ruby-how-to-detect-the-encoding-of-a-string/

# Warns when the optional CharDet constant (rchardet gem) is not loaded.
# Without it only the declared charset, ASCII and UTF-8 can be recognised.
# `error` is not defined in this file; presumably it is provided by the
# WhatWeb plugin environment.
def startup
  unless defined?(CharDet)
    error("ERROR: You need the rchardet gem to use the CharDet plugin to detect anything but ASCII or UTF-8.")
  end
end

# Extracts the charset declared in the first <meta ... Content-Type ...> tag.
#
# body - the page body as a String (nil is tolerated).
#
# Returns the declared charset name upper-cased (e.g. "ISO-8859-1"), or nil
# when there is no body, no such meta tag, or the tag carries no charset.
def get_charset(body)
  return nil if body.nil?

  # Match against a binary-tagged copy: a regexp match on a string whose bytes
  # are invalid for its current encoding raises ArgumentError, and an unknown
  # encoding is exactly the situation this plugin has to cope with.
  raw = body.dup.force_encoding('BINARY')

  meta_content_tag = raw[/<meta[^>]+Content-Type[^>]+>/i]
  return nil if meta_content_tag.nil?

  declared = meta_content_tag[/charset=['"]?([a-zA-Z0-9_-]+)/i, 1]
  # The capture group only admits ASCII characters, so tagging the result as
  # US-ASCII is always valid and keeps a binary-tagged string out of the output.
  declared && declared.upcase.force_encoding('US-ASCII')
end

# Tests whether body, interpreted as charset, can be converted to UTF-8.
#
# body    - the page body as a String; it is never modified (a copy is used).
# charset - an encoding name such as "UTF-8" or "WINDOWS-1252".
#
# Returns true when the bytes are valid in charset and transcode to UTF-8;
# false when the name is unknown to Ruby, the bytes are invalid for that
# encoding, or no conversion to UTF-8 is possible.
def convertible_to_utf8?(body, charset)
  candidate = body.dup.force_encoding(charset)
  # force_encoding only relabels the bytes, and encoding UTF-8 to UTF-8 is a
  # no-op, so the bytes have to be validated explicitly.
  return false unless candidate.valid_encoding?

  candidate.encode('UTF-8')
  true
rescue ArgumentError, EncodingError
  # ArgumentError: unknown encoding name. EncodingError: invalid byte
  # sequence, undefined conversion or no converter available.
  false
end

# Determines the character set of @body.
#
# Candidates are tried in this order: the charset declared in the meta tag,
# ASCII, UTF-8. The first one under which the body converts to UTF-8 is
# reported. If none fits and CharDet is available, its guess is reported
# (tagged with :module => "CharDet") provided the body converts under it.
#
# Returns an Array of match hashes: exactly one entry naming the charset, or
# a single "Failed" entry when nothing worked. Returns an empty Array when
# there is no body.
def passive
  m = []

  body = @body
  return m if body.nil?

  # Reference list of common charset names:
  #
  # Arabic (Windows)              Windows-1256
  # Baltic (Windows)              Windows-1257
  # Central European (Windows)    Windows-1250
  # Cyrillic (Windows)            Windows-1251
  # Greek (Windows)               Windows-1253
  # Hebrew (Windows)              Windows-1255
  # Thai (Windows)                TIS-620
  # Turkish (Windows)             Windows-1254
  # Vietnamese (Windows)          Windows-1258
  # Western European (Windows)    Windows-1252
  #
  # Arabic (ISO)                  ISO-8859-6
  # Baltic (ISO)                  ISO-8859-4
  # Central European (ISO)        ISO-8859-2
  # Cyrillic (ISO)                ISO-8859-5
  # Estonian (ISO)                ISO-8859-13
  # Greek (ISO)                   ISO-8859-7
  # Hebrew (ISO-Logical)          ISO-8859-8-l
  # Hebrew (ISO-Visual)           ISO-8859-8
  # Latin 9 (ISO)                 ISO-8859-15
  # Turkish (ISO)                 ISO-8859-9
  # Western European (ISO)        ISO-8859-1
  #
  # Chinese Simplified (GB18030)  GB18030
  # Chinese Simplified (GB2312)   GB2312
  # Chinese Simplified (HZ)       HZ
  # Chinese Traditional (Big5)    Big5
  # Japanese (Shift-JIS)          Shift_JIS
  # Japanese (EUC)                EUC-JP
  # Korean                        EUC-KR
  # Unicode (UTF-8)               UTF-8
  #
  # UTF-7 ISO-8859-5 ISO-8859-1 ISO-2022-JP WINDOWS-1250 IBM852 EUC-JP SHIFT_JIS BIG5 UTF-8 ASCII

  # Listed in the order they are tried. "UTF-8" (not "UTF_8") is the spelling
  # Ruby's Encoding lookup recognises.
  candidates = [get_charset(body), 'ASCII', 'UTF-8'].compact

  detected = candidates.find { |name| convertible_to_utf8?(body, name) }
  if detected
    m << {:string=> detected}
    return m
  end

  if defined?(CharDet)
    begin
      cd = CharDet.detect(body)
      encoding = cd && cd['encoding']
      encoding = encoding.upcase if encoding
      if encoding && convertible_to_utf8?(body, encoding)
        m << {:string=> encoding, :module=> "CharDet"}
        return m
      end
    rescue StandardError
      # CharDet is an optional, best-effort fallback: any failure inside it
      # is treated the same as "no charset detected".
    end
  end

  # Reported once, only after every candidate and CharDet have been tried.
  m << {:name=>"x",:string=> "Failed"}
  m
end

end