#!/usr/bin/env python

"""
Copyright (c) 2006-2019 sqlmap developers (http://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""

from __future__ import division

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target, post=None, cookie=None):
    if not target:
        return

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

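        # worker run by each crawling thread: pops URLs from the shared 'unprocessed'
        # set, retrieves them and collects newly discovered links (and forms when --forms is used)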
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=cookie, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
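                        # keep only the outermost <html> element (if present) to reduce parsing noise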
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

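                        # regex fallbacks for href/src attributes and window.open() calls missed by the anchor-tag scan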
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
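                                # if the last request got redirected, resolve relative links against the redirect URL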
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

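                                # skip URLs with extensions listed in CRAWL_EXCLUDE_EXTENSIONS; queue the rest
                                # for deeper crawling and keep parameterized (non-static) URLs as candidate targets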
                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

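        # also enqueue the site root (scheme://host) when the target URL points deeper into the site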
        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warn("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

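        # breadth-first crawl: each iteration processes one depth level and feeds
        # newly discovered links ('deeper') into the next one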
        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warn(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warn(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

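        # optionally normalize results by keeping a single representative URL per
        # distinct path and parameter-name combination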
        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

            storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    if not results:
        return

    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if kb.storeCrawlingChoice:
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        os.close(handle)

        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        with openFile(filename, "w+b") as f:
            if conf.forms:
                f.write("URL,POST\n")

            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)