#!/usr/bin/env python

"""
Copyright (c) 2006-2019 sqlmap developers (http://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""

import codecs
import gzip
import io
import logging
import re
import struct
import zlib

from lib.core.common import Backend
from lib.core.common import extractErrorMessage
from lib.core.common import extractRegexResult
from lib.core.common import filterNone
from lib.core.common import getPublicTypeMembers
from lib.core.common import getSafeExString
from lib.core.common import isListLike
from lib.core.common import randomStr
from lib.core.common import readInput
from lib.core.common import resetCookieJar
from lib.core.common import singleTimeLogMessage
from lib.core.common import singleTimeWarnMessage
from lib.core.common import unArrayizeValue
from lib.core.convert import decodeHex
from lib.core.convert import getBytes
from lib.core.convert import getText
from lib.core.convert import getUnicode
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.decorators import cachedmethod
from lib.core.decorators import lockedmethod
from lib.core.dicts import HTML_ENTITIES
from lib.core.enums import DBMS
from lib.core.enums import HTTP_HEADER
from lib.core.enums import PLACE
from lib.core.exception import SqlmapCompressionException
from lib.core.settings import BLOCKED_IP_REGEX
from lib.core.settings import DEFAULT_COOKIE_DELIMITER
from lib.core.settings import EVENTVALIDATION_REGEX
from lib.core.settings import IDENTYWAF_PARSE_LIMIT
from lib.core.settings import MAX_CONNECTION_TOTAL_SIZE
from lib.core.settings import META_CHARSET_REGEX
from lib.core.settings import PARSE_HEADERS_LIMIT
from lib.core.settings import SELECT_FROM_TABLE_REGEX
from lib.core.settings import UNICODE_ENCODING
from lib.core.settings import VIEWSTATE_REGEX
from lib.parse.headers import headersParser
from lib.parse.html import htmlParser
from thirdparty import six
from thirdparty.chardet import detect
from thirdparty.identywaf import identYwaf
from thirdparty.odict import OrderedDict
from thirdparty.six import unichr as _unichr
from thirdparty.six.moves import http_client as _http_client

@lockedmethod
def forgeHeaders(items=None, base=None):
    """
    Prepare HTTP Cookie, HTTP User-Agent and HTTP Referer headers to use when performing
    the HTTP requests
    """

    items = items or {}

    for _ in list(items.keys()):
        if items[_] is None:
            del items[_]

    headers = OrderedDict(conf.httpHeaders if base is None else base)
    headers.update(items.items())

    class _str(str):
        def capitalize(self):
            return _str(self)

        def title(self):
            return _str(self)
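
    # Note: the _str subclass above turns capitalize()/title() into no-ops so
    # that the stdlib HTTP machinery cannot re-capitalize non-standard header
    # names supplied by the user (see http://bugs.python.org/issue12455)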
    _ = headers
    headers = OrderedDict()
    for key, value in _.items():
        success = False

        for _ in headers:
            if _.upper() == key.upper():
                del headers[_]
                break

        if key.upper() not in (_.upper() for _ in getPublicTypeMembers(HTTP_HEADER, True)):
            try:
                headers[_str(key)] = value  # dirty hack for http://bugs.python.org/issue12455
            except UnicodeEncodeError:      # don't do the hack on non-ASCII header names (they have to be properly encoded later on)
                pass
            else:
                success = True

        if not success:
            key = '-'.join(_.capitalize() for _ in key.split('-'))
            headers[key] = value

    if conf.cj:
        if HTTP_HEADER.COOKIE in headers:
            for cookie in conf.cj:
                if cookie.domain_specified and not (conf.hostname or "").endswith(cookie.domain):
                    continue

                if ("%s=" % getUnicode(cookie.name)) in getUnicode(headers[HTTP_HEADER.COOKIE]):
                    if conf.loadCookies:
                        conf.httpHeaders = filterNone((item if item[0] != HTTP_HEADER.COOKIE else None) for item in conf.httpHeaders)
                    elif kb.mergeCookies is None:
                        message = "you provided an HTTP %s header value, while " % HTTP_HEADER.COOKIE
                        message += "the target URL provides its own cookies within "
                        message += "the HTTP %s header which intersect with yours. " % HTTP_HEADER.SET_COOKIE
                        message += "Do you want to merge them in further requests? [Y/n] "

                        kb.mergeCookies = readInput(message, default='Y', boolean=True)

                    if kb.mergeCookies and kb.injection.place != PLACE.COOKIE:
                        def _(value):
                            return re.sub(r"(?i)\b%s=[^%s]+" % (re.escape(getUnicode(cookie.name)), conf.cookieDel or DEFAULT_COOKIE_DELIMITER), ("%s=%s" % (getUnicode(cookie.name), getUnicode(cookie.value))).replace('\\', r'\\'), value)

                        headers[HTTP_HEADER.COOKIE] = _(headers[HTTP_HEADER.COOKIE])

                        if PLACE.COOKIE in conf.parameters:
                            conf.parameters[PLACE.COOKIE] = _(conf.parameters[PLACE.COOKIE])

                        conf.httpHeaders = [(item[0], item[1] if item[0] != HTTP_HEADER.COOKIE else _(item[1])) for item in conf.httpHeaders]

                elif not kb.testMode:
                    headers[HTTP_HEADER.COOKIE] += "%s %s=%s" % (conf.cookieDel or DEFAULT_COOKIE_DELIMITER, getUnicode(cookie.name), getUnicode(cookie.value))

        if kb.testMode and not any((conf.csrfToken, conf.safeUrl)):
            resetCookieJar(conf.cj)

    return headers

def parseResponse(page, headers, status=None):
    """
    @param page: the page to parse in order to feed the knowledge base htmlFp
    list (back-end DBMS fingerprint based upon DBMS error messages returned
    through the web application) and the absFilePaths set (absolute file
    paths)
    """

    if headers:
        headersParser(headers)

    if page:
        htmlParser(page if not status else "%s\n\n%s" % (status, page))

@cachedmethod
def checkCharEncoding(encoding, warn=True):
    """
    Checks the given encoding name, repairs common misspellings and adjusts it
    to the proper name used in the codecs module

    >>> checkCharEncoding('iso-8858', False)
    'iso8859-1'
    >>> checkCharEncoding('en_us', False)
    'utf8'
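
    A couple of additional illustrative cases, exercising the typo-repair and
    name-adjustment branches below:

    >>> checkCharEncoding('x-euc-kr', False)
    'euc_kr'
    >>> checkCharEncoding('windows-cp-1254', False)
    'windows-1254'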
    """

    if isinstance(encoding, six.binary_type):
        encoding = getUnicode(encoding)

    if isListLike(encoding):
        encoding = unArrayizeValue(encoding)

    if encoding:
        encoding = encoding.lower()
    else:
        return encoding

    # Reference: http://www.destructor.de/charsets/index.htm
    translate = {"windows-874": "iso-8859-11", "utf-8859-1": "utf8", "en_us": "utf8", "macintosh": "iso-8859-1", "euc_tw": "big5_tw", "th": "tis-620", "unicode": "utf8", "utc8": "utf8", "ebcdic": "ebcdic-cp-be", "iso-8859": "iso8859-1", "iso-8859-0": "iso8859-1", "ansi": "ascii", "gbk2312": "gbk", "windows-31j": "cp932", "en": "us"}

    for delimiter in (';', ',', '('):
        if delimiter in encoding:
            encoding = encoding[:encoding.find(delimiter)].strip()

    encoding = encoding.replace("&quot", "")

    # popular typos/errors
    if "8858" in encoding:
        encoding = encoding.replace("8858", "8859")  # iso-8858 -> iso-8859
    elif "8559" in encoding:
        encoding = encoding.replace("8559", "8859")  # iso-8559 -> iso-8859
    elif "8895" in encoding:
        encoding = encoding.replace("8895", "8859")  # iso-8895 -> iso-8859
    elif "5889" in encoding:
        encoding = encoding.replace("5889", "8859")  # iso-5889 -> iso-8859
    elif "5589" in encoding:
        encoding = encoding.replace("5589", "8859")  # iso-5589 -> iso-8859
    elif "2313" in encoding:
        encoding = encoding.replace("2313", "2312")  # gb2313 -> gb2312
    elif encoding.startswith("x-"):
        encoding = encoding[len("x-"):]              # x-euc-kr -> euc-kr  /  x-mac-turkish -> mac-turkish
    elif "windows-cp" in encoding:
        encoding = encoding.replace("windows-cp", "windows")  # windows-cp-1254 -> windows-1254

    # name adjustment for compatibility
    if encoding.startswith("8859"):
        encoding = "iso-%s" % encoding
    elif encoding.startswith("cp-"):
        encoding = "cp%s" % encoding[3:]
    elif encoding.startswith("euc-"):
        encoding = "euc_%s" % encoding[4:]
    elif encoding.startswith("windows") and not encoding.startswith("windows-"):
        encoding = "windows-%s" % encoding[7:]
    elif encoding.find("iso-88") > 0:
        encoding = encoding[encoding.find("iso-88"):]
    elif encoding.startswith("is0-"):
        encoding = "iso%s" % encoding[4:]
    elif encoding.find("ascii") > 0:
        encoding = "ascii"
    elif encoding.find("utf8") > 0:
        encoding = "utf8"
    elif encoding.find("utf-8") > 0:
        encoding = "utf-8"

    # Reference: http://philip.html5.org/data/charsets-2.html
    if encoding in translate:
        encoding = translate[encoding]
    elif encoding in ("null", "{charset}", "charset", "*") or not re.search(r"\w", encoding):
        return None

    # Reference: http://www.iana.org/assignments/character-sets
    # Reference: http://docs.python.org/library/codecs.html
    try:
        codecs.lookup(encoding)
    except:
        encoding = None

    if encoding:
        try:
            six.text_type(getBytes(randomStr()), encoding)
        except:
            if warn:
                warnMsg = "invalid web page charset '%s'" % encoding
                singleTimeLogMessage(warnMsg, logging.WARN, encoding)
            encoding = None

    return encoding

def getHeuristicCharEncoding(page):
    """
    Returns the web page charset detected by heuristics (chardet)

    Reference: https://chardet.readthedocs.io/en/latest/usage.html

    >>> getHeuristicCharEncoding(b"<html></html>")
    'ascii'
    """

    key = hash(page)
    retVal = kb.cache.encoding.get(key) or detect(page)["encoding"]
    kb.cache.encoding[key] = retVal

    if retVal and retVal.lower().replace('-', "") == UNICODE_ENCODING.lower().replace('-', ""):
        infoMsg = "heuristics detected web page charset '%s'" % retVal
        singleTimeLogMessage(infoMsg, logging.INFO, retVal)

    return retVal

def decodePage(page, contentEncoding, contentType, percentDecode=True):
    """
    Decode compressed/charset HTTP response

    >>> getText(decodePage(b"<html>foo&amp;bar</html>", None, "text/html; charset=utf-8"))
    '<html>foo&bar</html>'
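
    An additional illustrative case, exercising the hex and decimal HTML
    entity decoding paths below:

    >>> getText(decodePage(b"&#x31;&#50;3", None, "text/html"))
    '123'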
    """

    if not page or (conf.nullConnection and len(page) < 2):
        return getUnicode(page)

    if hasattr(contentEncoding, "lower"):
        contentEncoding = contentEncoding.lower()
    else:
        contentEncoding = ""

    if hasattr(contentType, "lower"):
        contentType = contentType.lower()
    else:
        contentType = ""

    if contentEncoding in ("gzip", "x-gzip", "deflate"):
        if not kb.pageCompress:
            return None

        try:
            if contentEncoding == "deflate":
                data = io.BytesIO(zlib.decompress(page, -15))  # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            else:
                data = gzip.GzipFile("", "rb", 9, io.BytesIO(page))
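                # the last 4 bytes of a gzip stream hold ISIZE (uncompressed size
                # modulo 2**32, little-endian) - a cheap decompression-bomb guard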
                size = struct.unpack("<l", page[-4:])[0]  # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py
                if size > MAX_CONNECTION_TOTAL_SIZE:
                    raise Exception("size too large")

            page = data.read()
        except Exception as ex:
            if b"<html" not in page:  # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored)
                errMsg = "detected invalid data for declared content "
                errMsg += "encoding '%s' ('%s')" % (contentEncoding, getSafeExString(ex))
                singleTimeLogMessage(errMsg, logging.ERROR)

                warnMsg = "turning off page compression"
                singleTimeWarnMessage(warnMsg)

                kb.pageCompress = False
                raise SqlmapCompressionException

    if not conf.encoding:
        httpCharset, metaCharset = None, None

        # Reference: http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
        if contentType.find("charset=") != -1:
            httpCharset = checkCharEncoding(contentType.split("charset=")[-1])

        metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page))

        if (any((httpCharset, metaCharset)) and not all((httpCharset, metaCharset))) or (httpCharset == metaCharset and all((httpCharset, metaCharset))):
            kb.pageEncoding = httpCharset or metaCharset  # Reference: http://bytes.com/topic/html-css/answers/154758-http-equiv-vs-true-header-has-precedence
            debugMsg = "declared web page charset '%s'" % kb.pageEncoding
            singleTimeLogMessage(debugMsg, logging.DEBUG, debugMsg)
        else:
            kb.pageEncoding = None
    else:
        kb.pageEncoding = conf.encoding

    # can't do for all responses because we need to support binary files too
    if isinstance(page, six.binary_type) and "text/" in contentType:
        if not kb.disableHtmlDecoding:
            # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
            if b"&#" in page:
                page = re.sub(b"&#x([0-9a-f]{1,2});", lambda _: decodeHex(_.group(1) if len(_.group(1)) == 2 else b"0%s" % _.group(1)), page)
                page = re.sub(b"&#(\\d{1,3});", lambda _: six.int2byte(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

            # e.g. %20%28%29
            if percentDecode:
                if b"%" in page:
                    page = re.sub(b"%([0-9a-fA-F]{2})", lambda _: decodeHex(_.group(1)), page)

            # e.g. &amp;
            page = re.sub(b"&([^;]+);", lambda _: six.int2byte(HTML_ENTITIES[getText(_.group(1))]) if HTML_ENTITIES.get(getText(_.group(1)), 256) < 256 else _.group(0), page)

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))

            if (kb.pageEncoding or "").lower() == "utf-8-sig":
                kb.pageEncoding = "utf-8"
                if page and page.startswith(b"\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
                    page = page[3:]

            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;
            if "&#" in page:
                def _(match):
                    retVal = match.group(0)
                    try:
                        retVal = _unichr(int(match.group(1)))
                    except (ValueError, OverflowError):
                        pass
                    return retVal
                page = re.sub(r"&#(\d+);", _, page)

            # e.g. &zeta;
            page = re.sub(r"&([^;]+);", lambda _: _unichr(HTML_ENTITIES[_.group(1)]) if HTML_ENTITIES.get(_.group(1), 0) > 255 else _.group(0), page)
        else:
            page = getUnicode(page, kb.pageEncoding)

    return page

def processResponse(page, responseHeaders, code=None, status=None):
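    """
    Post-process the HTTP response: fingerprint/error parsing, WAF/IPS
    identification (identYwaf), browser verification/CAPTCHA/blocking
    heuristics and automatic adjustment of ASP.NET hidden POST fields
    """
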
    kb.processResponseCounter += 1

    page = page or ""

    parseResponse(page, responseHeaders if kb.processResponseCounter < PARSE_HEADERS_LIMIT else None, status)

    if not kb.tableFrom and Backend.getIdentifiedDbms() in (DBMS.ACCESS,):
        kb.tableFrom = extractRegexResult(SELECT_FROM_TABLE_REGEX, page)
    else:
        kb.tableFrom = None

    if conf.parseErrors:
        msg = extractErrorMessage(page)

        if msg:
            logger.warning("parsed DBMS error message: '%s'" % msg.rstrip('.'))

    if kb.processResponseCounter < IDENTYWAF_PARSE_LIMIT:
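        # reconstruct an approximation of the raw HTTP response (status line,
        # headers and body) in the format expected by identYwaf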
        rawResponse = "%s %s %s\n%s\n%s" % (_http_client.HTTPConnection._http_vsn_str, code or "", status or "", getUnicode("".join(responseHeaders.headers if responseHeaders else [])), page)

        identYwaf.non_blind.clear()
        if identYwaf.non_blind_check(rawResponse, silent=True):
            for waf in identYwaf.non_blind:
                if waf not in kb.identifiedWafs:
                    kb.identifiedWafs.add(waf)
                    errMsg = "WAF/IPS identified as '%s'" % identYwaf.format_name(waf)
                    singleTimeLogMessage(errMsg, logging.CRITICAL)

    if kb.originalPage is None:
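        # ASP.NET rotates hidden __EVENTVALIDATION/__VIEWSTATE form fields across
        # responses; offer to keep the POST data in sync so that follow-up
        # requests remain valid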
        for regex in (EVENTVALIDATION_REGEX, VIEWSTATE_REGEX):
            match = re.search(regex, page)
            if match and PLACE.POST in conf.parameters:
                name, value = match.groups()
                if PLACE.POST in conf.paramDict and name in conf.paramDict[PLACE.POST]:
                    if conf.paramDict[PLACE.POST][name] in page:
                        continue
                    else:
                        msg = "do you want to automatically adjust the value of '%s'? [y/N]" % name

                        if not readInput(msg, default='N', boolean=True):
                            continue

                        conf.paramDict[PLACE.POST][name] = value
                conf.parameters[PLACE.POST] = re.sub(r"(?i)(%s=)[^&]+" % re.escape(name), r"\g<1>%s" % value.replace('\\', r'\\'), conf.parameters[PLACE.POST])

    if not kb.browserVerification and re.search(r"(?i)browser.?verification", page or ""):
        kb.browserVerification = True
        warnMsg = "potential browser verification protection mechanism detected"
        if re.search(r"(?i)CloudFlare", page):
            warnMsg += " (CloudFlare)"
        singleTimeWarnMessage(warnMsg)

    if not kb.captchaDetected and re.search(r"(?i)captcha", page or ""):
        for match in re.finditer(r"(?si)<form.+?</form>", page):
            if re.search(r"(?i)captcha", match.group(0)):
                kb.captchaDetected = True
                break

        if re.search(r"<meta[^>]+\brefresh\b[^>]+\bcaptcha\b", page):
            kb.captchaDetected = True

        if kb.captchaDetected:
            warnMsg = "potential CAPTCHA protection mechanism detected"
            if re.search(r"(?i)<title>[^<]*CloudFlare", page):
                warnMsg += " (CloudFlare)"
            singleTimeWarnMessage(warnMsg)

    if re.search(BLOCKED_IP_REGEX, page):
        warnMsg = "it appears that you have been blocked by the target server"
        singleTimeWarnMessage(warnMsg)