1##
2# This file is part of WhatWeb and may be subject to
3# redistribution and commercial restrictions. Please see the WhatWeb
4# web site for more information on licensing and terms of use.
5# http://www.morningstarsecurity.com/research/whatweb
6##
7
8## Version 0.2
9# added example sites
10
11Plugin.define "Charset" do
12author "Andrew Horton"
13version "0.2"
14description "Detects the character set of a page, this is required for MongoDB output. Only checks the meta content tag, not the HTTP header. It tries the specified charset, then ascii, then utf-8 then uses CharDet from the rchardet gem which is CPU intensive / slow. Tests conversion to UTF-8 using Iconv. Returns Failed if unsuccessful."
15
16
17# requires rchardet
18# https://github.com/jmhodges/rchardet
19# http://www.meeho.net/blog/2010/03/ruby-how-to-detect-the-encoding-of-a-string/
20
# Start-up hook for the plugin.
#
# When the CharDet constant is not defined (it is expected to come from the
# rchardet gem referenced above), the error helper is called with a message
# explaining that only ASCII and UTF-8 can be detected without it.
#
# Returns whatever error() returns, or nil when CharDet is available.
def startup
	return if defined?(CharDet)

	error("ERROR: You need the rchardet gem to use the CharDet plugin to detect anything but ASCII or UTF-8.")
end
26
# Extracts the character set declared in a page's Content-Type <meta> tag.
#
# Only the first <meta ...Content-Type...> tag in the document is examined;
# if that tag carries no charset=... value the result is nil, even when a
# later tag declares one.
#
# body - String holding the HTML document, or nil.
#
# Returns the declared charset upper-cased (e.g. "ISO-8859-1") as a UTF-8
# tagged String, or nil when body is nil, no Content-Type <meta> tag exists,
# or the tag has no charset value. body itself is never modified.
def get_charset(body)
	return nil if body.nil?

	# Match against a binary-tagged copy: the patterns are pure ASCII, and a
	# body whose bytes are invalid for its current encoding tag (the very case
	# this plugin exists for) would otherwise make the regexp match raise
	# ArgumentError ("invalid byte sequence").
	raw = body.dup.force_encoding('BINARY')

	meta_content_tag = raw[/<meta[^>]+Content-Type[^>]+>/i]
	return nil if meta_content_tag.nil?

	declared = meta_content_tag[/charset=['"]?([a-zA-Z0-9_-]+)/i, 1]
	return nil if declared.nil?

	# The capture can only contain ASCII letters, digits, "_" and "-", so
	# re-tagging the upper-cased copy as UTF-8 is always valid.
	declared.upcase.force_encoding('UTF-8')
end
37
# Passive check: works out which character set the page body (@body) is in.
#
# Candidates are tried in this order and the first one under which the body
# is well-formed and converts to UTF-8 wins:
#   1. the charset declared in the Content-Type <meta> tag (via get_charset)
#   2. ASCII
#   3. UTF-8
# When none of them fits and the CharDet constant (rchardet gem) is loaded,
# CharDet's guess is checked the same way.
#
# Every check runs on a copy, so @body (content and encoding tag) is left
# exactly as it was found.
#
# Returns an Array of match hashes:
#   [{:string=> "<CHARSET>"}]                       - a candidate converted
#   [{:string=> "<CHARSET>", :module=> "CharDet"}]  - CharDet's guess converted
#   [{:name=>"x", :string=> "Failed"}]              - nothing converted
#   []                                              - @body is nil
def passive
	body = @body
	return [] if body.nil?

	# Reference list of display names and charset labels, kept from the
	# original source:
	#
	#   Arabic (Windows)              Windows-1256
	#   Baltic (Windows)              Windows-1257
	#   Central European (Windows)    Windows-1250
	#   Cyrillic (Windows)            Windows-1251
	#   Greek (Windows)               Windows-1253
	#   Hebrew (Windows)              Windows-1255
	#   Thai (Windows)                TIS-620
	#   Turkish (Windows)             Windows-1254
	#   Vietnamese (Windows)          Windows-1258
	#   Western European (Windows)    Windows-1252
	#
	#   Arabic (ISO)                  ISO-8859-6
	#   Baltic (ISO)                  ISO-8859-4
	#   Central European (ISO)        ISO-8859-2
	#   Cyrillic (ISO)                ISO-8859-5
	#   Estonian (ISO)                ISO-8859-13
	#   Greek (ISO)                   ISO-8859-7
	#   Hebrew (ISO-Logical)          ISO-8859-8-l
	#   Hebrew (ISO-Visual)           ISO-8859-8
	#   Latin 9 (ISO)                 ISO-8859-15
	#   Turkish (ISO)                 ISO-8859-9
	#   Western European (ISO)        ISO-8859-1
	#
	#   Chinese Simplified (GB18030)  GB18030
	#   Chinese Simplified (GB2312)   GB2312
	#   Chinese Simplified (HZ)       HZ
	#   Chinese Traditional (Big5)    Big5
	#   Japanese (Shift-JIS)          Shift_JIS
	#   Japanese (EUC)                EUC-JP
	#   Korean                        EUC-KR
	#   Unicode (UTF-8)               UTF-8
	#
	# UTF-7 ISO-8859-5 ISO-8859-1 ISO-2022-JP WINDOWS-1250 IBM852 EUC-JP SHIFT_JIS BIG5 UTF-8 ASCII

	# Declared charset first (when there is one), then ASCII, then UTF-8.
	# "UTF-8" is spelled with a hyphen so that it is a name Ruby's Encoding
	# lookup accepts and matches the label reported for declared charsets.
	candidates = [get_charset(body), 'ASCII', 'UTF-8'].compact
	detected = candidates.find { |name| charset_converts_to_utf8?(body, name) }
	return [{:string=> detected}] if detected

	# Slow path, run at most once: ask CharDet and validate its guess.
	if defined?(CharDet)
		begin
			guess = CharDet.detect(body)['encoding'].upcase
			return [{:string=> guess, :module=> "CharDet"}] if charset_converts_to_utf8?(body, guess)
		rescue StandardError
			# CharDet is a best-effort fallback (it may return no encoding at
			# all); any failure here simply means the charset stays unknown.
		end
	end

	[{:name=>"x", :string=> "Failed"}]
end

# Helper for passive: true when body, read as encoding_name, is a well-formed
# byte sequence that transcodes to UTF-8. Works on a copy of body.
def charset_converts_to_utf8?(body, encoding_name)
	candidate = body.dup.force_encoding(encoding_name)
	# valid_encoding? is needed as well as encode: transcoding UTF-8 to UTF-8
	# is a no-op and would not notice malformed bytes.
	return false unless candidate.valid_encoding?

	candidate.encode('UTF-8')
	true
rescue ArgumentError, EncodingError
	# ArgumentError: unknown encoding name; EncodingError: bytes that cannot be
	# converted, or no converter for this encoding.
	false
end
112
113end
114
115