#!/usr/bin/env python

"""
Copyright (c) 2006-2019 sqlmap developers (http://sqlmap.org/)
See the file 'LICENSE' for copying permission
"""

from __future__ import division

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

def crawl(target, post=None, cookie=None):
    if not target:
        return

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

        # worker executed by each crawling thread (pops URLs from the shared queue)
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=cookie, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        # also pick up links from raw href/src attributes and window.open() calls
                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:  # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        # also enqueue the site root when the target URL contains a path
        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warn("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warn(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warn(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                # keep only one representative URL per path and parameter name signature
                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

        storeResultsToFile(kb.targets)

def storeResultsToFile(results):
    if not results:
        return

    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if kb.storeCrawlingChoice:
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        os.close(handle)

        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        with openFile(filename, "w+b") as f:
            if conf.forms:
                f.write("URL,POST\n")

            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)
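
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of this module): crawl() is normally
# driven by sqlmap's controller once option parsing has populated conf and kb
# (e.g. via init() from lib.core.option). The exact call site differs between
# versions, but conceptually it looks roughly like this:
#
#     from lib.core.data import conf, kb
#     from lib.utils.crawler import crawl
#
#     if conf.crawlDepth > 0:
#         # collects links carrying GET parameters into kb.targets and,
#         # if requested, stores them to a temporary file
#         crawl(conf.url, conf.data, conf.cookie)
# ---------------------------------------------------------------------------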