1#!/usr/local/bin/python 2# vim: et sw=4 ts=4: 3# -*- coding: utf-8 -*- 4# 5# Matomo - free/libre analytics platform 6# 7# @link https://matomo.org 8# @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later 9# @version $Id$ 10# 11# For more info see: https://matomo.org/log-analytics/ and https://matomo.org/docs/log-analytics-tool-how-to/ 12# 13# Requires Python 3.5, 3.6 or 3.7 14# 15from __future__ import print_function # this is needed that python2 can run the script until the warning below 16 17import sys 18 19if sys.version_info[0] != 3: 20 print('The log importer currently does not support Python 2 any more.') 21 print('Please use Python 3.5, 3.6, 3.7 or 3.8') 22 sys.exit(1) 23 24import base64 25import bz2 26import configparser 27import codecs 28import datetime 29import fnmatch 30import gzip 31import hashlib 32import http.client 33import inspect 34import itertools 35import json 36import logging 37import argparse 38import os 39import os.path 40import queue 41import re 42import ssl 43import sys 44import threading 45import time 46import urllib.request, urllib.parse, urllib.error 47import urllib.request, urllib.error, urllib.parse 48import urllib.parse 49import subprocess 50import traceback 51import socket 52import textwrap 53import collections 54import glob 55import io 56 57# Avoid "got more than 100 headers" error 58http.client._MAXHEADERS = 1000 59 60## 61## Constants. 62## 63 64STATIC_EXTENSIONS = set(( 65 'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff woff2 class swf css js xml webp' 66).split()) 67 68STATIC_FILES = set(( 69 'robots.txt' 70).split()) 71 72DOWNLOAD_EXTENSIONS = set(( 73 '7z aac arc arj asf asx avi bin csv deb dmg doc docx exe flac flv gz gzip hqx ' 74 'ibooks jar json mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp ' 75 'ods odt ogg ogv pdf phps ppt pptx qt qtm ra ram rar rpm rtf sea sit tar tbz ' 76 'bz2 tbz tgz torrent txt wav webm wma wmv wpd xls xlsx xml xsd z zip ' 77 'azw3 epub mobi apk ' 78 'md5 sig' 79).split()) 80 81# If you want to add more bots, take a look at the Matomo Device Detector botlist: 82# https://github.com/matomo-org/device-detector/blob/master/regexes/bots.yml 83# user agents must be lowercase 84EXCLUDED_USER_AGENTS = ( 85 'adsbot-google', 86 'ask jeeves', 87 'baidubot', 88 'bot-', 89 'bot/', 90 'ccooter/', 91 'crawl', 92 'curl', 93 'echoping', 94 'exabot', 95 'feed', 96 'googlebot', 97 'ia_archiver', 98 'java/', 99 'libwww', 100 'mediapartners-google', 101 'msnbot', 102 'netcraftsurvey', 103 'panopta', 104 'pingdom.com_bot_', 105 'robot', 106 'spider', 107 'surveybot', 108 'twiceler', 109 'voilabot', 110 'yahoo', 111 'yandex', 112 'zabbix', 113 'googlestackdrivermonitoring', 114) 115 116MATOMO_DEFAULT_MAX_ATTEMPTS = 3 117MATOMO_DEFAULT_DELAY_AFTER_FAILURE = 10 118DEFAULT_SOCKET_TIMEOUT = 300 119 120MATOMO_EXPECTED_IMAGE = base64.b64decode( 121 'R0lGODlhAQABAIAAAAAAAAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==' 122) 123 124## 125## Formats. 126## 127 128class BaseFormatException(Exception): pass 129 130class BaseFormat: 131 def __init__(self, name): 132 self.name = name 133 self.regex = None 134 self.date_format = '%d/%b/%Y:%H:%M:%S' 135 136 def check_format(self, file): 137 line = file.readline() 138 try: 139 file.seek(0) 140 except IOError: 141 pass 142 143 return self.check_format_line(line) 144 145 def check_format_line(self, line): 146 return False 147 148class JsonFormat(BaseFormat): 149 def __init__(self, name): 150 super(JsonFormat, self).__init__(name) 151 self.json = None 152 self.date_format = '%Y-%m-%dT%H:%M:%S' 153 154 def check_format_line(self, line): 155 try: 156 self.json = json.loads(line) 157 return True 158 except: 159 return False 160 161 def match(self, line): 162 try: 163 # nginx outputs malformed JSON w/ hex escapes when confronted w/ non-UTF input. we have to 164 # workaround this by converting hex escapes in strings to unicode escapes. the conversion is naive, 165 # so it does not take into account the string's actual encoding (which we don't have access to). 166 line = line.replace('\\x', '\\u00') 167 168 self.json = json.loads(line) 169 return self 170 except: 171 self.json = None 172 return None 173 174 def get(self, key): 175 # Some ugly patchs ... 176 if key == 'generation_time_milli': 177 self.json[key] = int(float(self.json[key]) * 1000) 178 # Patch date format ISO 8601 179 elif key == 'date': 180 tz = self.json[key][19:] 181 self.json['timezone'] = tz.replace(':', '') 182 self.json[key] = self.json[key][:19] 183 184 try: 185 return self.json[key] 186 except KeyError: 187 raise BaseFormatException() 188 189 def get_all(self,): 190 return self.json 191 192 def remove_ignored_groups(self, groups): 193 for group in groups: 194 del self.json[group] 195 196class RegexFormat(BaseFormat): 197 198 def __init__(self, name, regex, date_format=None): 199 super(RegexFormat, self).__init__(name) 200 if regex is not None: 201 self.regex = re.compile(regex) 202 if date_format is not None: 203 self.date_format = date_format 204 self.matched = None 205 206 def check_format_line(self, line): 207 return self.match(line) 208 209 def match(self,line): 210 if not self.regex: 211 return None 212 match_result = self.regex.match(line) 213 if match_result: 214 self.matched = match_result.groupdict() 215 if 'time' in self.matched: 216 self.matched['date'] = self.matched['date'] + ' ' + self.matched['time'] 217 del self.matched['time'] 218 else: 219 self.matched = None 220 return match_result 221 222 def get(self, key): 223 try: 224 return self.matched[key] 225 except KeyError: 226 raise BaseFormatException("Cannot find group '%s'." % key) 227 228 def get_all(self,): 229 return self.matched 230 231 def remove_ignored_groups(self, groups): 232 for group in groups: 233 del self.matched[group] 234 235class W3cExtendedFormat(RegexFormat): 236 237 FIELDS_LINE_PREFIX = '#Fields: ' 238 REGEX_UNKNOWN_FIELD = r'(?:".*?"|\S+)' 239 240 fields = { 241 'date': r'"?(?P<date>\d+[-\d+]+)"?', 242 'time': r'"?(?P<time>[\d+:]+)[.\d]*?"?', 243 'cs-uri-stem': r'(?P<path>/\S*)', 244 'cs-uri-query': r'(?P<query_string>\S*)', 245 'c-ip': r'"?(?P<ip>[\w*.:-]*)"?', 246 'cs(User-Agent)': r'(?P<user_agent>".*?"|\S*)', 247 'cs(Referer)': r'(?P<referrer>\S+)', 248 'sc-status': r'(?P<status>\d+)', 249 'sc-bytes': r'(?P<length>\S+)', 250 'cs-host': r'(?P<host>\S+)', 251 'cs-method': r'(?P<method>\S+)', 252 'cs-username': r'(?P<userid>\S+)', 253 'time-taken': r'(?P<generation_time_secs>[.\d]+)' 254 } 255 256 def __init__(self): 257 super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S') 258 259 def check_format(self, file): 260 try: 261 file.seek(0) 262 except IOError: 263 pass 264 265 self.create_regex(file) 266 267 # if we couldn't create a regex, this file does not follow the W3C extended log file format 268 if not self.regex: 269 try: 270 file.seek(0) 271 except IOError: 272 pass 273 274 return 275 276 first_line = file.readline() 277 278 try: 279 file.seek(0) 280 except IOError: 281 pass 282 283 return self.check_format_line(first_line) 284 285 def create_regex(self, file): 286 fields_line = None 287 if config.options.w3c_fields: 288 fields_line = config.options.w3c_fields 289 290 # collect all header lines up until the Fields: line 291 # if we're reading from stdin, we can't seek, so don't read any more than the Fields line 292 header_lines = [] 293 while fields_line is None: 294 line = file.readline().strip() 295 296 if not line: 297 continue 298 299 if not line.startswith('#'): 300 break 301 302 if line.startswith(self.FIELDS_LINE_PREFIX): 303 fields_line = line 304 else: 305 header_lines.append(line) 306 307 if not fields_line: 308 return 309 310 # store the header lines for a later check for IIS 311 self.header_lines = header_lines 312 313 # Parse the 'Fields: ' line to create the regex to use 314 full_regex = [] 315 316 expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping 317 318 # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds 319 if config.options.w3c_time_taken_in_millisecs: 320 expected_fields['time-taken'] = r'(?P<generation_time_milli>[\d.]+)' 321 322 for mapped_field_name, field_name in config.options.custom_w3c_fields.items(): 323 expected_fields[mapped_field_name] = expected_fields[field_name] 324 del expected_fields[field_name] 325 326 # add custom field regexes supplied through --w3c-field-regex option 327 for field_name, field_regex in config.options.w3c_field_regexes.items(): 328 expected_fields[field_name] = field_regex 329 330 # Skip the 'Fields: ' prefix. 331 fields_line = fields_line[9:].strip() 332 for field in re.split(r'\s+', fields_line): 333 try: 334 regex = expected_fields[field] 335 except KeyError: 336 regex = self.REGEX_UNKNOWN_FIELD 337 full_regex.append(regex) 338 full_regex = r'\s+'.join(full_regex) 339 340 logging.debug("Based on 'Fields:' line, computed regex to be %s", full_regex) 341 342 self.regex = re.compile(full_regex) 343 344 def check_for_iis_option(self): 345 if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis(): 346 logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS" 347 " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned" 348 " option must be used in order to get accurate generation times.") 349 350 def _is_iis(self): 351 return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0 352 353 def _is_time_taken_milli(self): 354 return 'generation_time_milli' not in self.regex.pattern 355 356class IisFormat(W3cExtendedFormat): 357 358 fields = W3cExtendedFormat.fields.copy() 359 fields.update({ 360 'time-taken': r'(?P<generation_time_milli>[.\d]+)', 361 'sc-win32-status': r'(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it 362 # will ensure we always select IIS for the format instead of 363 # W3C logs when detecting the format. This way there will be 364 # less accidental importing of IIS logs w/o --w3c-time-taken-milli. 365 }) 366 367 def __init__(self): 368 super(IisFormat, self).__init__() 369 370 self.name = 'iis' 371 372class IncapsulaW3CFormat(W3cExtendedFormat): 373 374 # use custom unknown field regex to make resulting regex much simpler 375 REGEX_UNKNOWN_FIELD = r'".*?"' 376 377 fields = W3cExtendedFormat.fields.copy() 378 # redefines all fields as they are always encapsulated with " 379 fields.update({ 380 'cs-uri': r'"(?P<host>[^\/\s]+)(?P<path>\S+)"', 381 'cs-uri-query': r'"(?P<query_string>\S*)"', 382 'c-ip': r'"(?P<ip>[\w*.:-]*)"', 383 'cs(User-Agent)': r'"(?P<user_agent>.*?)"', 384 'cs(Referer)': r'"(?P<referrer>\S+)"', 385 'sc-status': r'(?P<status>"\d*")', 386 'cs-bytes': r'(?P<length>"\d*")', 387 }) 388 389 def __init__(self): 390 super(IncapsulaW3CFormat, self).__init__() 391 392 self.name = 'incapsula_w3c' 393 394 def get(self, key): 395 value = super(IncapsulaW3CFormat, self).get(key) 396 if key == 'status' or key == 'length': 397 value = value.strip('"') 398 if key == 'status' and value == '': 399 value = '200' 400 return value 401 402class ShoutcastFormat(W3cExtendedFormat): 403 404 fields = W3cExtendedFormat.fields.copy() 405 fields.update({ 406 'c-status': r'(?P<status>\d+)', 407 'x-duration': r'(?P<generation_time_secs>[.\d]+)' 408 }) 409 410 def __init__(self): 411 super(ShoutcastFormat, self).__init__() 412 413 self.name = 'shoutcast' 414 415 def get(self, key): 416 if key == 'user_agent': 417 user_agent = super(ShoutcastFormat, self).get(key) 418 return urllib.parse.unquote(user_agent) 419 else: 420 return super(ShoutcastFormat, self).get(key) 421 422class AmazonCloudFrontFormat(W3cExtendedFormat): 423 424 fields = W3cExtendedFormat.fields.copy() 425 fields.update({ 426 'x-event': r'(?P<event_action>\S+)', 427 'x-sname': r'(?P<event_name>\S+)', 428 'cs-uri-stem': r'(?:rtmp:/)?(?P<path>/\S*)', 429 'c-user-agent': r'(?P<user_agent>".*?"|\S+)', 430 431 # following are present to match cloudfront instead of W3C when we know it's cloudfront 432 'x-edge-location': r'(?P<x_edge_location>".*?"|\S+)', 433 'x-edge-result-type': r'(?P<x_edge_result_type>".*?"|\S+)', 434 'x-edge-request-id': r'(?P<x_edge_request_id>".*?"|\S+)', 435 'x-host-header': r'(?P<host>".*?"|\S+)' 436 }) 437 438 def __init__(self): 439 super(AmazonCloudFrontFormat, self).__init__() 440 441 self.name = 'amazon_cloudfront' 442 443 def get(self, key): 444 if key == 'event_category' and 'event_category' not in self.matched: 445 return 'cloudfront_rtmp' 446 elif key == 'status' and 'status' not in self.matched: 447 return '200' 448 elif key == 'user_agent': 449 user_agent = super(AmazonCloudFrontFormat, self).get(key) 450 return urllib.parse.unquote(urllib.parse.unquote(user_agent)) # Value is double quoted! 451 else: 452 return super(AmazonCloudFrontFormat, self).get(key) 453 454_HOST_PREFIX = r'(?P<host>[\w\-\.]*)(?::\d+)?\s+' 455 456_COMMON_LOG_FORMAT = ( 457 r'(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+' 458 r'"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+(?P<length>\S+)' 459) 460_NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + 461 r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' 462) 463 464 465_S3_LOG_FORMAT = ( 466 r'\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>[\w*.:-]+)\s+' 467 r'(?P<userid>\S+)\s+\S+\s+\S+\s+\S+\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+\S+\s+(?P<length>\S+)\s+' 468 r'\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' 469) 470_ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT + 471 r'\s+(?P<session_time>[0-9-]+)' 472) 473_ELB_LOG_FORMAT = ( 474 r'(?:\S+\s+)?(?P<date>[0-9-]+T[0-9:]+)\.\S+\s+\S+\s+(?P<ip>[\w*.:-]+):\d+\s+\S+:\d+\s+\S+\s+(?P<generation_time_secs>\S+)\s+\S+\s+' 475 r'(?P<status>\d+)\s+\S+\s+\S+\s+(?P<length>\S+)\s+' 476 r'"\S+\s+\w+:\/\/(?P<host>[\w\-\.]*):\d+(?P<path>\/\S*)\s+[^"]+"\s+"(?P<user_agent>[^"]+)"\s+\S+\s+\S+' 477) 478 479_OVH_FORMAT = ( 480 r'(?P<ip>\S+)\s+' + _HOST_PREFIX + r'(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+' 481 r'"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)' 482 r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' 483) 484 485_HAPROXY_FORMAT = ( 486 r'.*:\ (?P<ip>[\w*.]+).*\[(?P<date>.*)\].*\ (?P<status>\b\d{3}\b)\ (?P<length>\d+)\ -.*\"(?P<method>\S+)\ (?P<path>\S+).*' 487) 488 489_GANDI_SIMPLE_HOSTING_FORMAT = ( 490 r'(?P<host>[0-9a-zA-Z-_.]+)\s+(?P<ip>[a-zA-Z0-9.]+)\s+\S+\s+(?P<userid>\S+)\s+\[(?P<date>.+?)\s+(?P<timezone>.+?)\]\s+\((?P<generation_time_secs>[0-9a-zA-Z\s]*)\)\s+"(?P<method>[A-Z]+)\s+(?P<path>\S+)\s+(\S+)"\s+(?P<status>[0-9]+)\s+(?P<length>\S+)\s+"(?P<referrer>\S+)"\s+"(?P<user_agent>[^"]+)"' 491) 492 493FORMATS = { 494 'common': RegexFormat('common', _COMMON_LOG_FORMAT), 495 'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT), 496 'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT), 497 'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT), 498 'w3c_extended': W3cExtendedFormat(), 499 'amazon_cloudfront': AmazonCloudFrontFormat(), 500 'incapsula_w3c': IncapsulaW3CFormat(), 501 'iis': IisFormat(), 502 'shoutcast': ShoutcastFormat(), 503 's3': RegexFormat('s3', _S3_LOG_FORMAT), 504 'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT), 505 'elb': RegexFormat('elb', _ELB_LOG_FORMAT, '%Y-%m-%dT%H:%M:%S'), 506 'nginx_json': JsonFormat('nginx_json'), 507 'ovh': RegexFormat('ovh', _OVH_FORMAT), 508 'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f'), 509 'gandi': RegexFormat('gandi', _GANDI_SIMPLE_HOSTING_FORMAT, '%d/%b/%Y:%H:%M:%S') 510} 511 512## 513## Code. 514## 515 516class StoreDictKeyPair(argparse.Action): 517 def __call__(self, parser, namespace, values, option_string=None): 518 my_dict = getattr(namespace, self.dest, None) 519 if not my_dict: 520 my_dict = {} 521 for kv in values.split(","): 522 k,v = kv.split("=") 523 my_dict[k] = v 524 setattr(namespace, self.dest, my_dict) 525 526class Configuration: 527 """ 528 Stores all the configuration options by reading sys.argv and parsing, 529 if needed, the config.inc.php. 530 531 It has 2 attributes: options and filenames. 532 """ 533 534 class Error(Exception): 535 pass 536 537 def _create_parser(self): 538 """ 539 Initialize and return the OptionParser instance. 540 """ 541 parser = argparse.ArgumentParser( 542 # usage='Usage: %prog [options] log_file [ log_file [...] ]', 543 description="Import HTTP access logs to Matomo. " 544 "log_file is the path to a server access log file (uncompressed, .gz, .bz2, or specify - to read from stdin). " 545 " You may also import many log files at once (for example set log_file to *.log or *.log.gz)." 546 " By default, the script will try to produce clean reports and will exclude bots, static files, discard http error and redirects, etc. This is customizable, see below.", 547 epilog="About Matomo Server Log Analytics: https://matomo.org/log-analytics/ " 548 " Found a bug? Please create a ticket in https://github.com/matomo-org/matomo-log-analytics/ " 549 " Please send your suggestions or successful user story to hello@matomo.org " 550 ) 551 552 parser.add_argument('file', type=str, nargs='+') 553 554 # Basic auth user 555 parser.add_argument( 556 '--auth-user', dest='auth_user', 557 help="Basic auth user", 558 ) 559 # Basic auth password 560 parser.add_argument( 561 '--auth-password', dest='auth_password', 562 help="Basic auth password", 563 ) 564 parser.add_argument( 565 '--debug', '-d', dest='debug', action='count', default=0, 566 help="Enable debug output (specify multiple times for more verbose)", 567 ) 568 parser.add_argument( 569 '--debug-tracker', dest='debug_tracker', action='store_true', default=False, 570 help="Appends &debug=1 to tracker requests and prints out the result so the tracker can be debugged. If " 571 "using the log importer results in errors with the tracker or improperly recorded visits, this option can " 572 "be used to find out what the tracker is doing wrong. To see debug tracker output, you must also set the " 573 "[Tracker] debug_on_demand INI config to 1 in your Matomo's config.ini.php file." 574 ) 575 parser.add_argument( 576 '--debug-request-limit', dest='debug_request_limit', type=int, default=None, 577 help="Debug option that will exit after N requests are parsed. Can be used w/ --debug-tracker to limit the " 578 "output of a large log file." 579 ) 580 parser.add_argument( 581 '--url', dest='matomo_url', required=True, 582 help="REQUIRED Your Matomo server URL, eg. https://example.com/matomo/ or https://analytics.example.net", 583 ) 584 parser.add_argument( 585 '--api-url', dest='matomo_api_url', 586 help="This URL will be used to send API requests (use it if your tracker URL differs from UI/API url), " 587 "eg. https://other-example.com/matomo/ or https://analytics-api.example.net", 588 ) 589 parser.add_argument( 590 '--tracker-endpoint-path', dest='matomo_tracker_endpoint_path', default='/piwik.php', 591 help="The tracker endpoint path to use when tracking. Defaults to /piwik.php." 592 ) 593 parser.add_argument( 594 '--dry-run', dest='dry_run', 595 action='store_true', default=False, 596 help="Perform a trial run with no tracking data being inserted into Matomo", 597 ) 598 parser.add_argument( 599 '--show-progress', dest='show_progress', 600 action='store_true', default=hasattr(sys.stdout, 'fileno') and os.isatty(sys.stdout.fileno()), 601 help="Print a progress report X seconds (default: 1, use --show-progress-delay to override)" 602 ) 603 parser.add_argument( 604 '--show-progress-delay', dest='show_progress_delay', 605 type=int, default=1, 606 help="Change the default progress delay" 607 ) 608 parser.add_argument( 609 '--add-sites-new-hosts', dest='add_sites_new_hosts', 610 action='store_true', default=False, 611 help="When a hostname is found in the log file, but not matched to any website " 612 "in Matomo, automatically create a new website in Matomo with this hostname to " 613 "import the logs" 614 ) 615 parser.add_argument( 616 '--idsite', dest='site_id', 617 help= ("When specified, " 618 "data in the specified log files will be tracked for this Matomo site ID." 619 " The script will not auto-detect the website based on the log line hostname (new websites will not be automatically created).") 620 ) 621 parser.add_argument( 622 '--idsite-fallback', dest='site_id_fallback', 623 help="Default Matomo site ID to use if the hostname doesn't match any " 624 "known Website's URL. New websites will not be automatically created. " 625 " Used only if --add-sites-new-hosts or --idsite are not set", 626 ) 627 default_config = os.path.abspath( 628 os.path.join(os.path.dirname(__file__), 629 '../../config/config.ini.php'), 630 ) 631 parser.add_argument( 632 '--config', dest='config_file', default=default_config, 633 help=( 634 "This is only used when --login and --password is not used. " 635 "Matomo will read the configuration file (default: %(default)s) to " 636 "fetch the Super User token_auth from the config file. " 637 ) 638 ) 639 parser.add_argument( 640 '--login', dest='login', 641 help="You can manually specify the Matomo Super User login" 642 ) 643 parser.add_argument( 644 '--password', dest='password', 645 help="You can manually specify the Matomo Super User password" 646 ) 647 parser.add_argument( 648 '--token-auth', dest='matomo_token_auth', 649 help="Matomo user token_auth, the token_auth is found in Matomo > Settings > API. " 650 "You must use a token_auth that has at least 'admin' or 'super user' permission. " 651 "If you use a token_auth for a non admin user, your users' IP addresses will not be tracked properly. " 652 ) 653 654 parser.add_argument( 655 '--hostname', dest='hostnames', action='append', default=[], 656 help="Accepted hostname (requests with other hostnames will be excluded). " 657 " You may use the star character * " 658 " Example: --hostname=*domain.com" 659 " Can be specified multiple times" 660 ) 661 parser.add_argument( 662 '--exclude-path', dest='excluded_paths', action='append', default=[], 663 help="Any URL path matching this exclude-path will not be imported in Matomo. " 664 " You must use the star character *. " 665 " Example: --exclude-path=*/admin/*" 666 " Can be specified multiple times. " 667 ) 668 parser.add_argument( 669 '--exclude-path-from', dest='exclude_path_from', 670 help="Each line from this file is a path to exclude. Each path must contain the character * to match a string. (see: --exclude-path)" 671 ) 672 parser.add_argument( 673 '--include-path', dest='included_paths', action='append', default=[], 674 help="Paths to include. Can be specified multiple times. If not specified, all paths are included." 675 ) 676 parser.add_argument( 677 '--include-path-from', dest='include_path_from', 678 help="Each line from this file is a path to include" 679 ) 680 parser.add_argument( 681 '--useragent-exclude', dest='excluded_useragents', 682 action='append', default=[], 683 help="User agents to exclude (in addition to the standard excluded " 684 "user agents). Can be specified multiple times", 685 ) 686 parser.add_argument( 687 '--enable-static', dest='enable_static', 688 action='store_true', default=False, 689 help="Track static files (images, css, js, ico, ttf, etc.)" 690 ) 691 parser.add_argument( 692 '--enable-bots', dest='enable_bots', 693 action='store_true', default=False, 694 help="Track bots. All bot visits will have a Custom Variable set with name='Bot' and value='$Bot_user_agent_here$'" 695 ) 696 parser.add_argument( 697 '--enable-http-errors', dest='enable_http_errors', 698 action='store_true', default=False, 699 help="Track HTTP errors (status code 4xx or 5xx)" 700 ) 701 parser.add_argument( 702 '--enable-http-redirects', dest='enable_http_redirects', 703 action='store_true', default=False, 704 help="Track HTTP redirects (status code 3xx except 304)" 705 ) 706 parser.add_argument( 707 '--enable-reverse-dns', dest='reverse_dns', 708 action='store_true', default=False, 709 help="Enable reverse DNS, used to generate the 'Providers' report in Matomo. " 710 "Disabled by default, as it impacts performance" 711 ) 712 parser.add_argument( 713 '--strip-query-string', dest='strip_query_string', 714 action='store_true', default=False, 715 help="Strip the query string from the URL" 716 ) 717 parser.add_argument( 718 '--query-string-delimiter', dest='query_string_delimiter', default='?', 719 help="The query string delimiter (default: %(default)s)" 720 ) 721 parser.add_argument( 722 '--log-format-name', dest='log_format_name', default=None, 723 help=("Access log format to detect (supported are: %s). " 724 "When not specified, the log format will be autodetected by trying all supported log formats." 725 % ', '.join(sorted(FORMATS.keys()))) 726 ) 727 available_regex_groups = ['date', 'path', 'query_string', 'ip', 'user_agent', 'referrer', 'status', 728 'length', 'host', 'userid', 'generation_time_milli', 'event_action', 729 'event_name', 'timezone', 'session_time'] 730 parser.add_argument( 731 '--log-format-regex', dest='log_format_regex', default=None, 732 help="Regular expression used to parse log entries. Regexes must contain named groups for different log fields. " 733 "Recognized fields include: %s. For an example of a supported Regex, see the source code of this file. " 734 "Overrides --log-format-name." % (', '.join(available_regex_groups)) 735 ) 736 parser.add_argument( 737 '--log-date-format', dest='log_date_format', default=None, 738 help="Format string used to parse dates. You can specify any format that can also be specified to " 739 "the strptime python function." 740 ) 741 parser.add_argument( 742 '--log-hostname', dest='log_hostname', default=None, 743 help="Force this hostname for a log format that doesn't include it. All hits " 744 "will seem to come to this host" 745 ) 746 parser.add_argument( 747 '--skip', dest='skip', default=0, type=int, 748 help="Skip the n first lines to start parsing/importing data at a given line for the specified log file", 749 ) 750 parser.add_argument( 751 '--recorders', dest='recorders', default=1, type=int, 752 help="Number of simultaneous recorders (default: %(default)s). " 753 "It should be set to the number of CPU cores in your server. " 754 "You can also experiment with higher values which may increase performance until a certain point", 755 ) 756 parser.add_argument( 757 '--recorder-max-payload-size', dest='recorder_max_payload_size', default=200, type=int, 758 help="Maximum number of log entries to record in one tracking request (default: %(default)s). " 759 ) 760 parser.add_argument( 761 '--replay-tracking', dest='replay_tracking', 762 action='store_true', default=False, 763 help="Replay piwik.php requests found in custom logs (only piwik.php requests expected). \nSee https://matomo.org/faq/how-to/faq_17033/" 764 ) 765 parser.add_argument( 766 '--replay-tracking-expected-tracker-file', dest='replay_tracking_expected_tracker_file', default=None, 767 help="The expected suffix for tracking request paths. Only logs whose paths end with this will be imported. By default " 768 "requests to the piwik.php file or the matomo.php file will be imported." 769 ) 770 parser.add_argument( 771 '--output', dest='output', 772 help="Redirect output (stdout and stderr) to the specified file" 773 ) 774 parser.add_argument( 775 '--encoding', dest='encoding', default='utf8', 776 help="Log files encoding (default: %(default)s)" 777 ) 778 parser.add_argument( 779 '--disable-bulk-tracking', dest='use_bulk_tracking', 780 default=True, action='store_false', 781 help="Disables use of bulk tracking so recorders record one hit at a time." 782 ) 783 parser.add_argument( 784 '--debug-force-one-hit-every-Ns', dest='force_one_action_interval', default=False, type=float, 785 help="Debug option that will force each recorder to record one hit every N secs." 786 ) 787 parser.add_argument( 788 '--force-lowercase-path', dest='force_lowercase_path', default=False, action='store_true', 789 help="Make URL path lowercase so paths with the same letters but different cases are " 790 "treated the same." 791 ) 792 parser.add_argument( 793 '--enable-testmode', dest='enable_testmode', default=False, action='store_true', 794 help="If set, it will try to get the token_auth from the matomo_tests directory" 795 ) 796 parser.add_argument( 797 '--download-extensions', dest='download_extensions', default=None, 798 help="By default Matomo tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped." 799 ) 800 parser.add_argument( 801 '--add-download-extensions', dest='extra_download_extensions', default=None, 802 help="Add extensions that should be treated as downloads. See --download-extensions for more info." 803 ) 804 parser.add_argument( 805 '--w3c-map-field', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="custom_w3c_fields", 806 help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log " 807 "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used " 808 "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n" 809 "Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more " 810 "fields that can be mapped." 811 % (', '.join(list(W3cExtendedFormat.fields.keys()))) 812 ) 813 parser.add_argument( 814 '--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs', 815 help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing" 816 " IIS logs." 817 ) 818 parser.add_argument( 819 '--w3c-fields', dest='w3c_fields', default=None, 820 help="Specify the '#Fields:' line for a log file in the W3C Extended log file format. Use this option if " 821 "your log file doesn't contain the '#Fields:' line which is required for parsing. This option must be used " 822 "in conjunction with --log-format-name=w3c_extended.\n" 823 "Example: --w3c-fields='#Fields: date time c-ip ...'" 824 ) 825 parser.add_argument( 826 '--w3c-field-regex', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="w3c_field_regexes", type=str, 827 help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the " 828 "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track " 829 "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) " 830 "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " 831 "in the 'Windows Status Code' custom variable. Regexes must contain a named group." 832 ) 833 parser.add_argument( 834 '--title-category-delimiter', dest='title_category_delimiter', default='/', 835 help="If --enable-http-errors is used, errors are shown in the page titles report. If you have " 836 "changed General.action_title_category_delimiter in your Matomo configuration, you need to set this " 837 "option to the same value in order to get a pretty page titles report." 838 ) 839 parser.add_argument( 840 '--dump-log-regex', dest='dump_log_regex', action='store_true', default=False, 841 help="Prints out the regex string used to parse log lines and exists. Can be useful for using formats " 842 "in newer versions of the script in older versions of the script. The output regex can be used with " 843 "the --log-format-regex option." 844 ) 845 846 parser.add_argument( 847 '--ignore-groups', dest='regex_groups_to_ignore', default=None, 848 help="Comma separated list of regex groups to ignore when parsing log lines. Can be used to, for example, " 849 "disable normal user id tracking. See documentation for --log-format-regex for list of available " 850 "regex groups." 851 ) 852 853 parser.add_argument( 854 '--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL',dest='regex_group_to_visit_cvars_map', default={}, 855 help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal " 856 "approach. For example, to track usernames as a custom variable instead of through the uid tracking " 857 "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a " 858 "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " 859 "for --log-format-regex (additional regex groups you may have defined " 860 "in --log-format-regex can also be used)." 861 ) 862 parser.add_argument( 863 '--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={}, 864 help="Track an attribute through a custom variable with page scope instead of through Matomo's normal " 865 "approach. For example, to track usernames as a custom variable instead of through the uid tracking " 866 "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a " 867 "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " 868 "for --log-format-regex (additional regex groups you may have defined " 869 "in --log-format-regex can also be used)." 870 ) 871 parser.add_argument( 872 '--track-http-method', dest='track_http_method', default=False, 873 help="Enables tracking of http method as custom page variable if method group is available in log format." 874 ) 875 parser.add_argument( 876 '--retry-max-attempts', dest='max_attempts', default=MATOMO_DEFAULT_MAX_ATTEMPTS, type=int, 877 help="The maximum number of times to retry a failed tracking request." 878 ) 879 parser.add_argument( 880 '--retry-delay', dest='delay_after_failure', default=MATOMO_DEFAULT_DELAY_AFTER_FAILURE, type=int, 881 help="The number of seconds to wait before retrying a failed tracking request." 882 ) 883 parser.add_argument( 884 '--request-timeout', dest='request_timeout', default=DEFAULT_SOCKET_TIMEOUT, type=int, 885 help="The maximum number of seconds to wait before terminating an HTTP request to Matomo." 886 ) 887 parser.add_argument( 888 '--include-host', action='append', type=str, 889 help="Only import logs from the specified host(s)." 890 ) 891 parser.add_argument( 892 '--exclude-host', action='append', type=str, 893 help="Only import logs that are not from the specified host(s)." 894 ) 895 parser.add_argument( 896 '--exclude-older-than', type=self._valid_date, default=None, 897 help="Ignore logs older than the specified date. Exclusive. Date format must be YYYY-MM-DD hh:mm:ss +/-0000. The timezone offset is required." 898 ) 899 parser.add_argument( 900 '--exclude-newer-than', type=self._valid_date, default=None, 901 help="Ignore logs newer than the specified date. Exclusive. Date format must be YYYY-MM-DD hh:mm:ss +/-0000. The timezone offset is required." 902 ) 903 parser.add_argument( 904 '--add-to-date', dest='seconds_to_add_to_date', default=0, type=int, 905 help="A number of seconds to add to each date value in the log file." 906 ) 907 parser.add_argument( 908 '--request-suffix', dest='request_suffix', default=None, type=str, help="Extra parameters to append to tracker and API requests." 909 ) 910 parser.add_argument( 911 '--accept-invalid-ssl-certificate', 912 dest='accept_invalid_ssl_certificate', action='store_true', 913 default=False, 914 help="Do not verify the SSL / TLS certificate when contacting the Matomo server." 915 ) 916 parser.add_argument( 917 '--php-binary', dest='php_binary', type=str, default='php', 918 help="Specify the PHP binary to use.", 919 ) 920 return parser 921 922 def _valid_date(self, value): 923 try: 924 (date_str, timezone) = value.rsplit(' ', 1) 925 except: 926 raise argparse.ArgumentTypeError("Invalid date value '%s'." % value) 927 928 if not re.match('[-+][0-9]{4}', timezone): 929 raise argparse.ArgumentTypeError("Invalid date value '%s': expected valid timzeone like +0100 or -1200, got '%s'" % (value, timezone)) 930 931 date = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S') 932 date -= TimeHelper.timedelta_from_timezone(timezone) 933 934 return date 935 936 def _parse_args(self, option_parser, argv = None): 937 """ 938 Parse the command line args and create self.options and self.filenames. 939 """ 940 if not argv: 941 argv = sys.argv[1:] 942 943 self.options = option_parser.parse_args(argv) 944 self.filenames = self.options.file 945 946 if self.options.output: 947 sys.stdout = sys.stderr = open(self.options.output, 'a') 948 949 all_filenames = [] 950 for self.filename in self.filenames: 951 if self.filename == '-': 952 all_filenames.append(self.filename) 953 else: 954 all_filenames = all_filenames + sorted(glob.glob(self.filename)) 955 self.filenames = all_filenames 956 957 # Configure logging before calling logging.{debug,info}. 958 logging.basicConfig( 959 format='%(asctime)s: [%(levelname)s] %(message)s', 960 level=logging.DEBUG if self.options.debug >= 1 else logging.INFO, 961 ) 962 963 self.options.excluded_useragents = set([s.lower() for s in self.options.excluded_useragents]) 964 965 if self.options.exclude_path_from: 966 paths = [path.strip() for path in open(self.options.exclude_path_from).readlines()] 967 self.options.excluded_paths.extend(path for path in paths if len(path) > 0) 968 if self.options.excluded_paths: 969 self.options.excluded_paths = set(self.options.excluded_paths) 970 logging.debug('Excluded paths: %s', ' '.join(self.options.excluded_paths)) 971 972 if self.options.include_path_from: 973 paths = [path.strip() for path in open(self.options.include_path_from).readlines()] 974 self.options.included_paths.extend(path for path in paths if len(path) > 0) 975 if self.options.included_paths: 976 self.options.included_paths = set(self.options.included_paths) 977 logging.debug('Included paths: %s', ' '.join(self.options.included_paths)) 978 979 if self.options.hostnames: 980 logging.debug('Accepted hostnames: %s', ', '.join(self.options.hostnames)) 981 else: 982 logging.debug('Accepted hostnames: all') 983 984 if self.options.log_format_regex: 985 self.format = RegexFormat('custom', self.options.log_format_regex, self.options.log_date_format) 986 elif self.options.log_format_name: 987 try: 988 self.format = FORMATS[self.options.log_format_name] 989 except KeyError: 990 fatal_error('invalid log format: %s' % self.options.log_format_name) 991 else: 992 self.format = None 993 994 if not hasattr(self.options, 'custom_w3c_fields'): 995 self.options.custom_w3c_fields = {} 996 elif self.format is not None: 997 # validate custom field mappings 998 for dummy_custom_name, default_name in self.options.custom_w3c_fields.items(): 999 if default_name not in type(format).fields: 1000 fatal_error("custom W3C field mapping error: don't know how to parse and use the '%s' field" % default_name) 1001 return 1002 1003 if hasattr(self.options, 'w3c_field_regexes'): 1004 # make sure each custom w3c field regex has a named group 1005 for field_name, field_regex in self.options.w3c_field_regexes.items(): 1006 if '(?P<' not in field_regex: 1007 fatal_error("cannot find named group in custom w3c field regex '%s' for field '%s'" % (field_regex, field_name)) 1008 return 1009 1010 1011 if not (self.options.matomo_url.startswith('http://') or self.options.matomo_url.startswith('https://')): 1012 self.options.matomo_url = 'http://' + self.options.matomo_url 1013 logging.debug('Matomo Tracker API URL is: %s', self.options.matomo_url) 1014 1015 if not self.options.matomo_api_url: 1016 self.options.matomo_api_url = self.options.matomo_url 1017 1018 if not (self.options.matomo_api_url.startswith('http://') or self.options.matomo_api_url.startswith('https://')): 1019 self.options.matomo_api_url = 'http://' + self.options.matomo_api_url 1020 logging.debug('Matomo Analytics API URL is: %s', self.options.matomo_api_url) 1021 1022 if self.options.recorders < 1: 1023 self.options.recorders = 1 1024 1025 download_extensions = DOWNLOAD_EXTENSIONS 1026 if self.options.download_extensions: 1027 download_extensions = set(self.options.download_extensions.split(',')) 1028 1029 if self.options.extra_download_extensions: 1030 download_extensions.update(self.options.extra_download_extensions.split(',')) 1031 self.options.download_extensions = download_extensions 1032 1033 if self.options.regex_groups_to_ignore: 1034 self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(',')) 1035 1036 def __init__(self, argv = None): 1037 self._parse_args(self._create_parser(), argv) 1038 1039 def _get_token_auth(self): 1040 """ 1041 If the token auth is not specified in the options, get it from Matomo. 1042 """ 1043 # Get superuser login/password from the options. 1044 logging.debug('No token-auth specified') 1045 1046 if self.options.login and self.options.password: 1047 matomo_login = self.options.login 1048 matomo_password = self.options.password 1049 1050 logging.debug('Using credentials: (login = %s, using password = %s)', matomo_login, 'YES' if matomo_password else 'NO') 1051 try: 1052 api_result = matomo.call_api('UsersManager.createAppSpecificTokenAuth', 1053 userLogin=matomo_login, 1054 passwordConfirmation=matomo_password, 1055 description='Log importer', 1056 expireHours='48', 1057 _token_auth='', 1058 _url=self.options.matomo_api_url, 1059 ) 1060 except urllib.error.URLError as e: 1061 fatal_error('error when fetching token_auth from the API: %s' % e) 1062 1063 try: 1064 return api_result['value'] 1065 except KeyError: 1066 # Happens when the credentials are invalid. 1067 message = api_result.get('message') 1068 fatal_error( 1069 'error fetching authentication token token_auth%s' % ( 1070 ': %s' % message if message else '') 1071 ) 1072 else: 1073 # Fallback to the given (or default) configuration file, then 1074 # get the token from the API. 1075 logging.debug( 1076 'No credentials specified, reading them from "%s"', 1077 self.options.config_file, 1078 ) 1079 config_file = configparser.RawConfigParser(strict=False) 1080 success = len(config_file.read(self.options.config_file)) > 0 1081 if not success: 1082 fatal_error( 1083 "the configuration file" + self.options.config_file + " could not be read. Please check permission. This file must be readable by the user running this script to get the authentication token" 1084 ) 1085 1086 updatetokenfile = os.path.abspath( 1087 os.path.join(self.options.config_file, 1088 '../../misc/cron/updatetoken.php'), 1089 ) 1090 1091 phpBinary = config.options.php_binary 1092 1093 # Special handling for windows (only if given php binary does not differ from default) 1094 is_windows = sys.platform.startswith('win') 1095 if phpBinary == 'php' and is_windows: 1096 try: 1097 processWin = subprocess.Popen('where php.exe', stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1098 [stdout, stderr] = processWin.communicate() 1099 if processWin.returncode == 0: 1100 phpBinary = stdout.strip() 1101 else: 1102 fatal_error("We couldn't detect PHP. It might help to add your php.exe to the path or alternatively run the importer using the --login and --password option") 1103 except: 1104 fatal_error("We couldn't detect PHP. You can run the importer using the --login and --password option to fix this issue") 1105 1106 command = [phpBinary, updatetokenfile] 1107 if self.options.enable_testmode: 1108 command.append('--testmode') 1109 1110 hostname = urllib.parse.urlparse( self.options.matomo_url ).hostname 1111 command.append('--matomo-domain=' + hostname ) 1112 1113 command = subprocess.list2cmdline(command) 1114 1115# logging.debug(command); 1116 1117 process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 1118 [stdout, stderr] = process.communicate() 1119 stdout, stderr = stdout.decode(), stderr.decode() 1120 if process.returncode != 0: 1121 fatal_error("`" + command + "` failed with error: " + stderr + ".\nReponse code was: " + str(process.returncode) + ". You can alternatively run the importer using the --login and --password option") 1122 1123 filename = stdout 1124 credentials = open(filename, 'r').readline() 1125 credentials = credentials.split('\t') 1126 return credentials[1] 1127 1128 def get_resolver(self): 1129 if self.options.site_id: 1130 logging.debug('Resolver: static') 1131 return StaticResolver(self.options.site_id) 1132 else: 1133 logging.debug('Resolver: dynamic') 1134 return DynamicResolver() 1135 1136 def init_token_auth(self): 1137 if not self.options.matomo_token_auth: 1138 try: 1139 self.options.matomo_token_auth = self._get_token_auth() 1140 except MatomoHttpBase.Error as e: 1141 fatal_error(e) 1142 logging.debug('Authentication token token_auth is: %s', self.options.matomo_token_auth) 1143 1144 1145class Statistics: 1146 """ 1147 Store statistics about parsed logs and recorded entries. 1148 Can optionally print statistics on standard output every second. 1149 """ 1150 1151 class Counter: 1152 """ 1153 Simple integers cannot be used by multithreaded programs. See: 1154 https://stackoverflow.com/questions/6320107/are-python-ints-thread-safe 1155 """ 1156 def __init__(self): 1157 # itertools.count's implementation in C does not release the GIL and 1158 # therefore is thread-safe. 1159 self.counter = itertools.count(1) 1160 self.value = 0 1161 1162 def increment(self): 1163 self.value = next(self.counter) 1164 1165 def advance(self, n): 1166 for i in range(n): 1167 self.increment() 1168 1169 def __str__(self): 1170 return str(int(self.value)) 1171 1172 def __init__(self): 1173 self.time_start = None 1174 self.time_stop = None 1175 1176 self.matomo_sites = set() # sites ID 1177 self.matomo_sites_created = [] # (hostname, site ID) 1178 self.matomo_sites_ignored = set() # hostname 1179 1180 self.count_lines_parsed = self.Counter() 1181 self.count_lines_recorded = self.Counter() 1182 1183 # requests that the Matomo tracker considered invalid (or failed to track) 1184 self.invalid_lines = [] 1185 1186 # Do not match the regexp. 1187 self.count_lines_invalid = self.Counter() 1188 # Were filtered out. 1189 self.count_lines_filtered = self.Counter() 1190 # No site ID found by the resolver. 1191 self.count_lines_no_site = self.Counter() 1192 # Hostname filtered by config.options.hostnames 1193 self.count_lines_hostname_skipped = self.Counter() 1194 # Static files. 1195 self.count_lines_static = self.Counter() 1196 # Ignored user-agents. 1197 self.count_lines_skipped_user_agent = self.Counter() 1198 # Ignored HTTP errors. 1199 self.count_lines_skipped_http_errors = self.Counter() 1200 # Ignored HTTP redirects. 1201 self.count_lines_skipped_http_redirects = self.Counter() 1202 # Downloads 1203 self.count_lines_downloads = self.Counter() 1204 # Ignored downloads when --download-extensions is used 1205 self.count_lines_skipped_downloads = self.Counter() 1206 1207 # Misc 1208 self.dates_recorded = set() 1209 self.monitor_stop = False 1210 1211 def set_time_start(self): 1212 self.time_start = time.time() 1213 1214 def set_time_stop(self): 1215 self.time_stop = time.time() 1216 1217 def _compute_speed(self, value, start, end): 1218 delta_time = end - start 1219 if value == 0: 1220 return 0 1221 if delta_time == 0: 1222 return 'very high!' 1223 else: 1224 return value / delta_time 1225 1226 def _round_value(self, value, base=100): 1227 return round(value * base) / base 1228 1229 def _indent_text(self, lines, level=1): 1230 """ 1231 Return an indented text. 'lines' can be a list of lines or a single 1232 line (as a string). One level of indentation is 4 spaces. 1233 """ 1234 prefix = ' ' * (4 * level) 1235 if isinstance(lines, str): 1236 return prefix + lines 1237 else: 1238 return '\n'.join( 1239 prefix + line 1240 for line in lines 1241 ) 1242 1243 def print_summary(self): 1244 invalid_lines_summary = '' 1245 if self.invalid_lines: 1246 invalid_lines_summary = '''Invalid log lines 1247----------------- 1248 1249The following lines were not tracked by Matomo, either due to a malformed tracker request or error in the tracker: 1250 1251%s 1252 1253''' % textwrap.fill(", ".join(self.invalid_lines), 80) 1254 1255 print((''' 1256%(invalid_lines)sLogs import summary 1257------------------- 1258 1259 %(count_lines_recorded)d requests imported successfully 1260 %(count_lines_downloads)d requests were downloads 1261 %(total_lines_ignored)d requests ignored: 1262 %(count_lines_skipped_http_errors)d HTTP errors 1263 %(count_lines_skipped_http_redirects)d HTTP redirects 1264 %(count_lines_invalid)d invalid log lines 1265 %(count_lines_filtered)d filtered log lines 1266 %(count_lines_no_site)d requests did not match any known site 1267 %(count_lines_hostname_skipped)d requests did not match any --hostname 1268 %(count_lines_skipped_user_agent)d requests done by bots, search engines... 1269 %(count_lines_static)d requests to static resources (css, js, images, ico, ttf...) 1270 %(count_lines_skipped_downloads)d requests to file downloads did not match any --download-extensions 1271 1272Website import summary 1273---------------------- 1274 1275 %(count_lines_recorded)d requests imported to %(total_sites)d sites 1276 %(total_sites_existing)d sites already existed 1277 %(total_sites_created)d sites were created: 1278%(sites_created)s 1279 %(total_sites_ignored)d distinct hostnames did not match any existing site: 1280%(sites_ignored)s 1281%(sites_ignored_tips)s 1282 1283Performance summary 1284------------------- 1285 1286 Total time: %(total_time)d seconds 1287 Requests imported per second: %(speed_recording)s requests per second 1288 1289Processing your log data 1290------------------------ 1291 1292 In order for your logs to be processed by Matomo, you may need to run the following command: 1293 ./console core:archive --force-all-websites --url='%(url)s' 1294''' % { 1295 1296 'count_lines_recorded': self.count_lines_recorded.value, 1297 'count_lines_downloads': self.count_lines_downloads.value, 1298 'total_lines_ignored': sum([ 1299 self.count_lines_invalid.value, 1300 self.count_lines_filtered.value, 1301 self.count_lines_skipped_user_agent.value, 1302 self.count_lines_skipped_http_errors.value, 1303 self.count_lines_skipped_http_redirects.value, 1304 self.count_lines_static.value, 1305 self.count_lines_skipped_downloads.value, 1306 self.count_lines_no_site.value, 1307 self.count_lines_hostname_skipped.value, 1308 ]), 1309 'count_lines_invalid': self.count_lines_invalid.value, 1310 'count_lines_filtered': self.count_lines_filtered.value, 1311 'count_lines_skipped_user_agent': self.count_lines_skipped_user_agent.value, 1312 'count_lines_skipped_http_errors': self.count_lines_skipped_http_errors.value, 1313 'count_lines_skipped_http_redirects': self.count_lines_skipped_http_redirects.value, 1314 'count_lines_static': self.count_lines_static.value, 1315 'count_lines_skipped_downloads': self.count_lines_skipped_downloads.value, 1316 'count_lines_no_site': self.count_lines_no_site.value, 1317 'count_lines_hostname_skipped': self.count_lines_hostname_skipped.value, 1318 'total_sites': len(self.matomo_sites), 1319 'total_sites_existing': len(self.matomo_sites - set(site_id for hostname, site_id in self.matomo_sites_created)), 1320 'total_sites_created': len(self.matomo_sites_created), 1321 'sites_created': self._indent_text( 1322 ['%s (ID: %d)' % (hostname, site_id) for hostname, site_id in self.matomo_sites_created], 1323 level=3, 1324 ), 1325 'total_sites_ignored': len(self.matomo_sites_ignored), 1326 'sites_ignored': self._indent_text( 1327 self.matomo_sites_ignored, level=3, 1328 ), 1329 'sites_ignored_tips': ''' 1330 TIPs: 1331 - if one of these hosts is an alias host for one of the websites 1332 in Matomo, you can add this host as an "Alias URL" in Settings > Websites. 1333 - use --add-sites-new-hosts if you wish to automatically create 1334 one website for each of these hosts in Matomo rather than discarding 1335 these requests. 1336 - use --idsite-fallback to force all these log lines with a new hostname 1337 to be recorded in a specific idsite (for example for troubleshooting/visualizing the data) 1338 - use --idsite to force all lines in the specified log files 1339 to be all recorded in the specified idsite 1340 - or you can also manually create a new Website in Matomo with the URL set to this hostname 1341''' if self.matomo_sites_ignored else '', 1342 'total_time': self.time_stop - self.time_start, 1343 'speed_recording': self._round_value(self._compute_speed( 1344 self.count_lines_recorded.value, 1345 self.time_start, self.time_stop, 1346 )), 1347 'url': config.options.matomo_api_url, 1348 'invalid_lines': invalid_lines_summary 1349})) 1350 1351 ## 1352 ## The monitor is a thread that prints a short summary each second. 1353 ## 1354 1355 def _monitor(self): 1356 latest_total_recorded = 0 1357 while not self.monitor_stop: 1358 current_total = stats.count_lines_recorded.value 1359 time_elapsed = time.time() - self.time_start 1360 print(('%d lines parsed, %d lines recorded, %d records/sec (avg), %d records/sec (current)' % ( 1361 stats.count_lines_parsed.value, 1362 current_total, 1363 current_total / time_elapsed if time_elapsed != 0 else 0, 1364 (current_total - latest_total_recorded) / config.options.show_progress_delay, 1365 ))) 1366 latest_total_recorded = current_total 1367 time.sleep(config.options.show_progress_delay) 1368 1369 def start_monitor(self): 1370 t = threading.Thread(target=self._monitor) 1371 t.daemon = True 1372 t.start() 1373 1374 def stop_monitor(self): 1375 self.monitor_stop = True 1376 1377class TimeHelper: 1378 1379 @staticmethod 1380 def timedelta_from_timezone(timezone): 1381 timezone = int(timezone) 1382 sign = 1 if timezone >= 0 else -1 1383 n = abs(timezone) 1384 1385 hours = int(n / 100) * sign 1386 minutes = n % 100 * sign 1387 1388 return datetime.timedelta(hours=hours, minutes=minutes) 1389 1390class UrlHelper: 1391 1392 @staticmethod 1393 def convert_array_args(args): 1394 """ 1395 Converts PHP deep query param arrays (eg, w/ names like hsr_ev[abc][0][]=value) into a nested list/dict 1396 structure that will convert correctly to JSON. 1397 """ 1398 1399 final_args = collections.OrderedDict() 1400 for key, value in args.items(): 1401 indices = key.split('[') 1402 if '[' in key: 1403 # contains list of all indices, eg for abc[def][ghi][] = 123, indices would be ['abc', 'def', 'ghi', ''] 1404 indices = [i.rstrip(']') for i in indices] 1405 1406 # navigate the multidimensional array final_args, creating lists/dicts when needed, using indices 1407 element = final_args 1408 for i in range(0, len(indices) - 1): 1409 idx = indices[i] 1410 1411 # if there's no next key, then this element is a list, otherwise a dict 1412 element_type = list if not indices[i + 1] else dict 1413 if idx not in element or not isinstance(element[idx], element_type): 1414 element[idx] = element_type() 1415 1416 element = element[idx] 1417 1418 # set the value in the final container we navigated to 1419 if not indices[-1]: # last indice is '[]' 1420 element.append(value) 1421 else: # last indice has a key, eg, '[abc]' 1422 element[indices[-1]] = value 1423 else: 1424 final_args[key] = value 1425 1426 return UrlHelper._convert_dicts_to_arrays(final_args) 1427 1428 @staticmethod 1429 def _convert_dicts_to_arrays(d): 1430 # convert dicts that have contiguous integer keys to arrays 1431 for key, value in d.items(): 1432 if not isinstance(value, dict): 1433 continue 1434 1435 if UrlHelper._has_contiguous_int_keys(value): 1436 d[key] = UrlHelper._convert_dict_to_array(value) 1437 else: 1438 d[key] = UrlHelper._convert_dicts_to_arrays(value) 1439 1440 return d 1441 1442 @staticmethod 1443 def _has_contiguous_int_keys(d): 1444 for i in range(0, len(d)): 1445 if str(i) not in d: 1446 return False 1447 return True 1448 1449 @staticmethod 1450 def _convert_dict_to_array(d): 1451 result = [] 1452 for i in range(0, len(d)): 1453 result.append(d[str(i)]) 1454 return result 1455 1456class MatomoHttpBase: 1457 class Error(Exception): 1458 1459 def __init__(self, message, code = None): 1460 super(MatomoHttpBase.Error, self).__init__(message) 1461 1462 self.code = code 1463 1464 1465class MatomoHttpUrllib(MatomoHttpBase): 1466 """ 1467 Make requests to Matomo. 1468 """ 1469 1470 class RedirectHandlerWithLogging(urllib.request.HTTPRedirectHandler): 1471 """ 1472 Special implementation of HTTPRedirectHandler that logs redirects in debug mode 1473 to help users debug system issues. 1474 """ 1475 1476 def redirect_request(self, req, fp, code, msg, hdrs, newurl): 1477 logging.debug("Request redirected (code: %s) to '%s'" % (code, newurl)) 1478 1479 return urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, hdrs, newurl) 1480 1481 def _call(self, path, args, headers=None, url=None, data=None): 1482 """ 1483 Make a request to the Matomo site. It is up to the caller to format 1484 arguments, to embed authentication, etc. 1485 """ 1486 if url is None: 1487 url = config.options.matomo_url 1488 headers = headers or {} 1489 1490 if data is None: 1491 # If Content-Type isn't defined, PHP do not parse the request's body. 1492 headers['Content-type'] = 'application/x-www-form-urlencoded' 1493 data = urllib.parse.urlencode(args) 1494 elif not isinstance(data, str) and headers['Content-type'] == 'application/json': 1495 data = json.dumps(data) 1496 1497 if args: 1498 path = path + '?' + urllib.parse.urlencode(args) 1499 1500 if config.options.request_suffix: 1501 path = path + ('&' if '?' in path else '?') + config.options.request_suffix 1502 1503 headers['User-Agent'] = 'Matomo/LogImport' 1504 1505 try: 1506 timeout = config.options.request_timeout 1507 except: 1508 timeout = None # the config global object may not be created at this point 1509 1510 request = urllib.request.Request(url + path, data.encode("utf-8"), headers) 1511 1512 # Handle basic auth if auth_user set 1513 try: 1514 auth_user = config.options.auth_user 1515 auth_password = config.options.auth_password 1516 except: 1517 auth_user = None 1518 auth_password = None 1519 1520 if auth_user is not None: 1521 base64string = base64.encodebytes('{}:{}'.format(auth_user, auth_password).encode()).decode().replace('\n', '') 1522 request.add_header("Authorization", "Basic %s" % base64string) 1523 1524 # Use non-default SSL context if invalid certificates shall be 1525 # accepted. 1526 if config.options.accept_invalid_ssl_certificate and \ 1527 sys.version_info >= (2, 7, 9): 1528 ssl_context = ssl.create_default_context() 1529 ssl_context.check_hostname = False 1530 ssl_context.verify_mode = ssl.CERT_NONE 1531 https_handler_args = {'context': ssl_context} 1532 else: 1533 https_handler_args = {} 1534 opener = urllib.request.build_opener( 1535 self.RedirectHandlerWithLogging(), 1536 urllib.request.HTTPSHandler(**https_handler_args)) 1537 response = opener.open(request, timeout = timeout) 1538 encoding = response.info().get_content_charset('utf-8') 1539 result = response.read() 1540 response.close() 1541 return result.decode(encoding) 1542 1543 def _call_api(self, method, **kwargs): 1544 """ 1545 Make a request to the Matomo API taking care of authentication, body 1546 formatting, etc. 1547 """ 1548 args = { 1549 'module' : 'API', 1550 'format' : 'json', 1551 'method' : method, 1552 'filter_limit' : '-1', 1553 } 1554 # token_auth, by default, is taken from config. 1555 token_auth = kwargs.pop('_token_auth', None) 1556 if token_auth is None: 1557 token_auth = config.options.matomo_token_auth 1558 if token_auth: 1559 args['token_auth'] = token_auth 1560 1561 url = kwargs.pop('_url', None) 1562 if url is None: 1563 url = config.options.matomo_api_url 1564 1565 1566 if kwargs: 1567 args.update(kwargs) 1568 1569 # Convert lists into appropriate format. 1570 # See: https://developer.matomo.org/api-reference/reporting-api#passing-an-array-of-data-as-a-parameter 1571 # Warning: we have to pass the parameters in order: foo[0], foo[1], foo[2] 1572 # and not foo[1], foo[0], foo[2] (it will break Matomo otherwise.) 1573 final_args = [] 1574 for key, value in args.items(): 1575 if isinstance(value, (list, tuple)): 1576 for index, obj in enumerate(value): 1577 final_args.append(('%s[%d]' % (key, index), obj)) 1578 else: 1579 final_args.append((key, value)) 1580 1581 1582# logging.debug('%s' % final_args) 1583# logging.debug('%s' % url) 1584 1585 res = self._call('/', final_args, url=url) 1586 1587 try: 1588 return json.loads(res) 1589 except ValueError: 1590 raise urllib.error.URLError('Matomo returned an invalid response: ' + res.decode("utf-8") ) 1591 1592 def _call_wrapper(self, func, expected_response, on_failure, *args, **kwargs): 1593 """ 1594 Try to make requests to Matomo at most MATOMO_FAILURE_MAX_RETRY times. 1595 """ 1596 errors = 0 1597 while True: 1598 try: 1599 response = func(*args, **kwargs) 1600 if expected_response is not None and response != expected_response: 1601 if on_failure is not None: 1602 error_message = on_failure(response, kwargs.get('data')) 1603 else: 1604 error_message = "didn't receive the expected response. Response was %s " % response 1605 1606 raise urllib.error.URLError(error_message) 1607 return response 1608 except (urllib.error.URLError, http.client.HTTPException, ValueError, socket.timeout) as e: 1609 logging.info('Error when connecting to Matomo: %s', e) 1610 1611 code = None 1612 if isinstance(e, urllib.error.HTTPError): 1613 # See Python issue 13211. 1614 message = 'HTTP Error %s %s' % (e.code, e.msg) 1615 code = e.code 1616 elif isinstance(e, urllib.error.URLError): 1617 message = e.reason 1618 else: 1619 message = str(e) 1620 1621 # decorate message w/ HTTP response, if it can be retrieved 1622 if hasattr(e, 'read'): 1623 message = message + ", response: " + e.read().decode() 1624 1625 try: 1626 delay_after_failure = config.options.delay_after_failure 1627 max_attempts = config.options.max_attempts 1628 except NameError: 1629 delay_after_failure = MATOMO_DEFAULT_DELAY_AFTER_FAILURE 1630 max_attempts = MATOMO_DEFAULT_MAX_ATTEMPTS 1631 1632 errors += 1 1633 if errors == max_attempts: 1634 logging.info("Max number of attempts reached, server is unreachable!") 1635 1636 raise MatomoHttpBase.Error(message, code) 1637 else: 1638 logging.info("Retrying request, attempt number %d" % (errors + 1)) 1639 1640 time.sleep(delay_after_failure) 1641 1642 def call(self, path, args, expected_content=None, headers=None, data=None, on_failure=None): 1643 return self._call_wrapper(self._call, expected_content, on_failure, path, args, headers, 1644 data=data) 1645 1646 def call_api(self, method, **kwargs): 1647 return self._call_wrapper(self._call_api, None, None, method, **kwargs) 1648 1649## 1650## Resolvers. 1651## 1652## A resolver is a class that turns a hostname into a Matomo site ID. 1653## 1654 1655class StaticResolver: 1656 """ 1657 Always return the same site ID, specified in the configuration. 1658 """ 1659 1660 def __init__(self, site_id): 1661 self.site_id = site_id 1662 # Go get the main URL 1663 site = matomo.call_api( 1664 'SitesManager.getSiteFromId', idSite=self.site_id 1665 ) 1666 if site.get('result') == 'error': 1667 fatal_error( 1668 "cannot get the main URL of this site: %s" % site.get('message') 1669 ) 1670 self._main_url = site['main_url'] 1671 stats.matomo_sites.add(self.site_id) 1672 1673 def resolve(self, hit): 1674 return (self.site_id, self._main_url) 1675 1676 def check_format(self, format): 1677 pass 1678 1679class DynamicResolver: 1680 """ 1681 Use Matomo API to determine the site ID. 1682 """ 1683 1684 _add_site_lock = threading.Lock() 1685 1686 def __init__(self): 1687 self._cache = {} 1688 if config.options.replay_tracking: 1689 # get existing sites 1690 self._cache['sites'] = matomo.call_api('SitesManager.getAllSites') 1691 1692 def _get_site_id_from_hit_host(self, hit): 1693 return matomo.call_api( 1694 'SitesManager.getSitesIdFromSiteUrl', 1695 url=hit.host, 1696 ) 1697 1698 def _add_site(self, hit): 1699 main_url = 'http://' + hit.host 1700 DynamicResolver._add_site_lock.acquire() 1701 1702 try: 1703 # After we obtain the lock, make sure the site hasn't already been created. 1704 res = self._get_site_id_from_hit_host(hit) 1705 if res: 1706 return res[0]['idsite'] 1707 1708 # The site doesn't exist. 1709 logging.debug('No Matomo site found for the hostname: %s', hit.host) 1710 if config.options.site_id_fallback is not None: 1711 logging.debug('Using default site for hostname: %s', hit.host) 1712 return config.options.site_id_fallback 1713 elif config.options.add_sites_new_hosts: 1714 if config.options.dry_run: 1715 # Let's just return a fake ID. 1716 return 0 1717 logging.debug('Creating a Matomo site for hostname %s', hit.host) 1718 result = matomo.call_api( 1719 'SitesManager.addSite', 1720 siteName=hit.host, 1721 urls=[main_url], 1722 ) 1723 if result.get('result') == 'error': 1724 logging.error("Couldn't create a Matomo site for host %s: %s", 1725 hit.host, result.get('message'), 1726 ) 1727 return None 1728 else: 1729 site_id = result['value'] 1730 stats.matomo_sites_created.append((hit.host, site_id)) 1731 return site_id 1732 else: 1733 # The site doesn't exist, we don't want to create new sites and 1734 # there's no default site ID. We thus have to ignore this hit. 1735 return None 1736 finally: 1737 DynamicResolver._add_site_lock.release() 1738 1739 def _resolve(self, hit): 1740 res = self._get_site_id_from_hit_host(hit) 1741 if res: 1742 # The site already exists. 1743 site_id = res[0]['idsite'] 1744 else: 1745 site_id = self._add_site(hit) 1746 if site_id is not None: 1747 stats.matomo_sites.add(site_id) 1748 return site_id 1749 1750 def _resolve_when_replay_tracking(self, hit): 1751 """ 1752 If parsed site ID found in the _cache['sites'] return site ID and main_url, 1753 otherwise return (None, None) tuple. 1754 """ 1755 site_id = hit.args['idsite'] 1756 if site_id in self._cache['sites']: 1757 stats.matomo_sites.add(site_id) 1758 return (site_id, self._cache['sites'][site_id]['main_url']) 1759 else: 1760 return (None, None) 1761 1762 def _resolve_by_host(self, hit): 1763 """ 1764 Returns the site ID and site URL for a hit based on the hostname. 1765 """ 1766 try: 1767 site_id = self._cache[hit.host] 1768 except KeyError: 1769 logging.debug( 1770 'Site ID for hostname %s not in cache', hit.host 1771 ) 1772 site_id = self._resolve(hit) 1773 logging.debug('Site ID for hostname %s: %s', hit.host, site_id) 1774 self._cache[hit.host] = site_id 1775 return (site_id, 'http://' + hit.host) 1776 1777 def resolve(self, hit): 1778 """ 1779 Return the site ID from the cache if found, otherwise call _resolve. 1780 If replay_tracking option is enabled, call _resolve_when_replay_tracking. 1781 """ 1782 if config.options.replay_tracking: 1783 # We only consider requests with piwik.php which don't need host to be imported 1784 return self._resolve_when_replay_tracking(hit) 1785 else: 1786 # Workaround for empty Host bug issue #126 1787 if hit.host.strip() == '': 1788 hit.host = 'no-hostname-found-in-log' 1789 return self._resolve_by_host(hit) 1790 1791 def check_format(self, format): 1792 if config.options.replay_tracking: 1793 pass 1794 elif format.regex is not None and 'host' not in format.regex.groupindex and not config.options.log_hostname: 1795 fatal_error( 1796 "the selected log format doesn't include the hostname: you must " 1797 "specify the Matomo site ID with the --idsite argument" 1798 ) 1799 1800class Recorder: 1801 """ 1802 A Recorder fetches hits from the Queue and inserts them into Matomo using 1803 the API. 1804 """ 1805 1806 recorders = [] 1807 1808 def __init__(self): 1809 self.queue = queue.Queue(maxsize=2) 1810 1811 # if bulk tracking disabled, make sure we can store hits outside of the Queue 1812 if not config.options.use_bulk_tracking: 1813 self.unrecorded_hits = [] 1814 1815 @classmethod 1816 def launch(cls, recorder_count): 1817 """ 1818 Launch a bunch of Recorder objects in a separate thread. 1819 """ 1820 for i in range(recorder_count): 1821 recorder = Recorder() 1822 cls.recorders.append(recorder) 1823 1824 run = recorder._run_bulk if config.options.use_bulk_tracking else recorder._run_single 1825 t = threading.Thread(target=run) 1826 1827 t.daemon = True 1828 t.start() 1829 logging.debug('Launched recorder') 1830 1831 @classmethod 1832 def add_hits(cls, all_hits): 1833 """ 1834 Add a set of hits to the recorders queue. 1835 """ 1836 # Organize hits so that one client IP will always use the same queue. 1837 # We have to do this so visits from the same IP will be added in the right order. 1838 hits_by_client = [[] for r in cls.recorders] 1839 for hit in all_hits: 1840 hits_by_client[hit.get_visitor_id_hash() % len(cls.recorders)].append(hit) 1841 1842 for i, recorder in enumerate(cls.recorders): 1843 recorder.queue.put(hits_by_client[i]) 1844 1845 @classmethod 1846 def wait_empty(cls): 1847 """ 1848 Wait until all recorders have an empty queue. 1849 """ 1850 for recorder in cls.recorders: 1851 recorder._wait_empty() 1852 1853 def _run_bulk(self): 1854 while True: 1855 try: 1856 hits = self.queue.get() 1857 except: 1858 # TODO: we should log something here, however when this happens, logging.etc will throw 1859 return 1860 1861 if len(hits) > 0: 1862 try: 1863 self._record_hits(hits) 1864 except MatomoHttpBase.Error as e: 1865 fatal_error(e, hits[0].filename, hits[0].lineno) # approximate location of error 1866 self.queue.task_done() 1867 1868 def _run_single(self): 1869 while True: 1870 if config.options.force_one_action_interval != False: 1871 time.sleep(config.options.force_one_action_interval) 1872 1873 if len(self.unrecorded_hits) > 0: 1874 hit = self.unrecorded_hits.pop(0) 1875 1876 try: 1877 self._record_hits([hit]) 1878 except MatomoHttpBase.Error as e: 1879 fatal_error(e, hit.filename, hit.lineno) 1880 else: 1881 self.unrecorded_hits = self.queue.get() 1882 self.queue.task_done() 1883 1884 def _wait_empty(self): 1885 """ 1886 Wait until the queue is empty. 1887 """ 1888 while True: 1889 if self.queue.empty(): 1890 # We still have to wait for the last queue item being processed 1891 # (queue.empty() returns True before queue.task_done() is 1892 # called). 1893 self.queue.join() 1894 return 1895 time.sleep(1) 1896 1897 def date_to_matomo(self, date): 1898 date, time = date.isoformat(sep=' ').split() 1899 return '%s %s' % (date, time.replace('-', ':')) 1900 1901 def _get_hit_args(self, hit): 1902 """ 1903 Returns the args used in tracking a hit, without the token_auth. 1904 """ 1905 site_id, main_url = resolver.resolve(hit) 1906 if site_id is None: 1907 # This hit doesn't match any known Matomo site. 1908 if config.options.replay_tracking: 1909 stats.matomo_sites_ignored.add('unrecognized site ID %s' % hit.args.get('idsite')) 1910 else: 1911 stats.matomo_sites_ignored.add(hit.host) 1912 stats.count_lines_no_site.increment() 1913 return 1914 1915 stats.dates_recorded.add(hit.date.date()) 1916 1917 path = hit.path 1918 if hit.query_string and not config.options.strip_query_string: 1919 path += config.options.query_string_delimiter + hit.query_string 1920 1921 # only prepend main url / host if it's a path 1922 url_prefix = self._get_host_with_protocol(hit.host, main_url) if hasattr(hit, 'host') else main_url 1923 url = (url_prefix if path.startswith('/') else '') + path[:1024] 1924 1925 # handle custom variables before generating args dict 1926 if config.options.enable_bots: 1927 if hit.is_robot: 1928 hit.add_visit_custom_var("Bot", hit.user_agent) 1929 else: 1930 hit.add_visit_custom_var("Not-Bot", hit.user_agent) 1931 1932 hit.add_page_custom_var("HTTP-code", hit.status) 1933 1934 args = { 1935 'rec': '1', 1936 'apiv': '1', 1937 'url': url, 1938 'urlref': hit.referrer[:1024], 1939 'cip': hit.ip, 1940 'cdt': self.date_to_matomo(hit.date), 1941 'idsite': site_id, 1942 'queuedtracking': '0', 1943 'dp': '0' if config.options.reverse_dns else '1', 1944 'ua': hit.user_agent 1945 } 1946 1947 if config.options.replay_tracking: 1948 # prevent request to be force recorded when option replay-tracking 1949 args['rec'] = '0' 1950 1951 # idsite is already determined by resolver 1952 if 'idsite' in hit.args: 1953 del hit.args['idsite'] 1954 1955 args.update(hit.args) 1956 1957 if hit.is_download: 1958 args['download'] = args['url'] 1959 1960 if config.options.enable_bots: 1961 args['bots'] = '1' 1962 1963 if hit.is_error or hit.is_redirect: 1964 args['action_name'] = '%s%sURL = %s%s' % ( 1965 hit.status, 1966 config.options.title_category_delimiter, 1967 urllib.parse.quote(args['url'], ''), 1968 ("%sFrom = %s" % ( 1969 config.options.title_category_delimiter, 1970 urllib.parse.quote(args['urlref'], '') 1971 ) if args['urlref'] != '' else '') 1972 ) 1973 1974 if hit.generation_time_milli > 0: 1975 args['pf_srv'] = int(hit.generation_time_milli) 1976 1977 if hit.event_category and hit.event_action: 1978 args['e_c'] = hit.event_category 1979 args['e_a'] = hit.event_action 1980 1981 if hit.event_name: 1982 args['e_n'] = hit.event_name 1983 1984 if hit.length: 1985 args['bw_bytes'] = hit.length 1986 1987 # convert custom variable args to JSON 1988 if 'cvar' in args and not isinstance(args['cvar'], str): 1989 args['cvar'] = json.dumps(args['cvar']) 1990 1991 if '_cvar' in args and not isinstance(args['_cvar'], str): 1992 args['_cvar'] = json.dumps(args['_cvar']) 1993 1994 return UrlHelper.convert_array_args(args) 1995 1996 def _get_host_with_protocol(self, host, main_url): 1997 if '://' not in host: 1998 parts = urllib.parse.urlparse(main_url) 1999 host = parts.scheme + '://' + host 2000 return host 2001 2002 def _record_hits(self, hits): 2003 """ 2004 Inserts several hits into Matomo. 2005 """ 2006 if not config.options.dry_run: 2007 data = { 2008 'token_auth': config.options.matomo_token_auth, 2009 'requests': [self._get_hit_args(hit) for hit in hits] 2010 } 2011 try: 2012 args = {} 2013 2014 if config.options.debug_tracker: 2015 args['debug'] = '1' 2016 2017 response = matomo.call( 2018 config.options.matomo_tracker_endpoint_path, args=args, 2019 expected_content=None, 2020 headers={'Content-type': 'application/json'}, 2021 data=data, 2022 on_failure=self._on_tracking_failure 2023 ) 2024 2025 if config.options.debug_tracker: 2026 logging.debug('tracker response:\n%s' % response) 2027 2028 # check for invalid requests 2029 try: 2030 response = json.loads(response) 2031 except: 2032 logging.info("bulk tracking returned invalid JSON") 2033 2034 # don't display the tracker response if we're debugging the tracker. 2035 # debug tracker output will always break the normal JSON output. 2036 if not config.options.debug_tracker: 2037 logging.info("tracker response:\n%s" % response) 2038 2039 response = {} 2040 2041 if ('invalid_indices' in response and isinstance(response['invalid_indices'], list) and 2042 response['invalid_indices']): 2043 invalid_count = len(response['invalid_indices']) 2044 2045 invalid_lines = [str(hits[index].lineno) for index in response['invalid_indices']] 2046 invalid_lines_str = ", ".join(invalid_lines) 2047 2048 stats.invalid_lines.extend(invalid_lines) 2049 2050 logging.info("The Matomo tracker identified %s invalid requests on lines: %s" % (invalid_count, invalid_lines_str)) 2051 elif 'invalid' in response and response['invalid'] > 0: 2052 logging.info("The Matomo tracker identified %s invalid requests." % response['invalid']) 2053 except MatomoHttpBase.Error as e: 2054 # if the server returned 400 code, BulkTracking may not be enabled 2055 if e.code == 400: 2056 fatal_error("Server returned status 400 (Bad Request).\nIs the BulkTracking plugin disabled?", hits[0].filename, hits[0].lineno) 2057 2058 raise 2059 2060 stats.count_lines_recorded.advance(len(hits)) 2061 2062 def _is_json(self, result): 2063 try: 2064 json.loads(result) 2065 return True 2066 except ValueError: 2067 return False 2068 2069 def _on_tracking_failure(self, response, data): 2070 """ 2071 Removes the successfully tracked hits from the request payload so 2072 they are not logged twice. 2073 """ 2074 try: 2075 response = json.loads(response) 2076 except: 2077 # the response should be in JSON, but in case it can't be parsed just try another attempt 2078 logging.debug("cannot parse tracker response, should be valid JSON") 2079 return response 2080 2081 # remove the successfully tracked hits from payload 2082 tracked = response['tracked'] 2083 data['requests'] = data['requests'][tracked:] 2084 2085 return response['message'] 2086 2087class Hit: 2088 """ 2089 It's a simple container. 2090 """ 2091 def __init__(self, **kwargs): 2092 for key, value in kwargs.items(): 2093 setattr(self, key, value) 2094 super(Hit, self).__init__() 2095 2096 if config.options.force_lowercase_path: 2097 self.full_path = self.full_path.lower() 2098 2099 def get_visitor_id_hash(self): 2100 visitor_id = self.ip 2101 2102 if config.options.replay_tracking: 2103 for param_name_to_use in ['uid', 'cid', '_id', 'cip']: 2104 if param_name_to_use in self.args: 2105 visitor_id = self.args[param_name_to_use] 2106 break 2107 2108 return abs(hash(visitor_id)) 2109 2110 def add_page_custom_var(self, key, value): 2111 """ 2112 Adds a page custom variable to this Hit. 2113 """ 2114 self._add_custom_var(key, value, 'cvar') 2115 2116 def add_visit_custom_var(self, key, value): 2117 """ 2118 Adds a visit custom variable to this Hit. 2119 """ 2120 self._add_custom_var(key, value, '_cvar') 2121 2122 def _add_custom_var(self, key, value, api_arg_name): 2123 if api_arg_name not in self.args: 2124 self.args[api_arg_name] = {} 2125 2126 if isinstance(self.args[api_arg_name], str): 2127 logging.debug("Ignoring custom %s variable addition [ %s = %s ], custom var already set to string." % (api_arg_name, key, value)) 2128 return 2129 2130 index = len(self.args[api_arg_name]) + 1 2131 self.args[api_arg_name][index] = [key, value] 2132 2133class Parser: 2134 """ 2135 The Parser parses the lines in a specified file and inserts them into 2136 a Queue. 2137 """ 2138 2139 def __init__(self): 2140 self.check_methods = [method for name, method 2141 in inspect.getmembers(self, predicate=inspect.ismethod) 2142 if name.startswith('check_')] 2143 2144 ## All check_* methods are called for each hit and must return True if the 2145 ## hit can be imported, False otherwise. 2146 2147 def check_hostname(self, hit): 2148 # Check against config.hostnames. 2149 if not hasattr(hit, 'host') or not config.options.hostnames: 2150 return True 2151 2152 # Accept the hostname only if it matches one pattern in the list. 2153 result = any( 2154 fnmatch.fnmatch(hit.host, pattern) 2155 for pattern in config.options.hostnames 2156 ) 2157 if not result: 2158 stats.count_lines_hostname_skipped.increment() 2159 return result 2160 2161 def check_static(self, hit): 2162 filename = hit.path.split('/')[-1] 2163 2164 if hit.extension in STATIC_EXTENSIONS or filename in STATIC_FILES: 2165 if config.options.enable_static: 2166 hit.is_download = True 2167 return True 2168 else: 2169 stats.count_lines_static.increment() 2170 return False 2171 return True 2172 2173 def check_download(self, hit): 2174 if hit.extension in config.options.download_extensions: 2175 stats.count_lines_downloads.increment() 2176 hit.is_download = True 2177 return True 2178 # the file is not in the white-listed downloads 2179 # if it's a know download file, we shall skip it 2180 elif hit.extension in DOWNLOAD_EXTENSIONS: 2181 stats.count_lines_skipped_downloads.increment() 2182 return False 2183 return True 2184 2185 def check_user_agent(self, hit): 2186 user_agent = hit.user_agent.lower() 2187 for s in itertools.chain(EXCLUDED_USER_AGENTS, config.options.excluded_useragents): 2188 if s in user_agent: 2189 if config.options.enable_bots: 2190 hit.is_robot = True 2191 return True 2192 else: 2193 stats.count_lines_skipped_user_agent.increment() 2194 return False 2195 return True 2196 2197 def check_http_error(self, hit): 2198 if hit.status[0] in ('4', '5'): 2199 if config.options.replay_tracking: 2200 # process error logs for replay tracking, since we don't care if matomo error-ed the first time 2201 return True 2202 elif config.options.enable_http_errors: 2203 hit.is_error = True 2204 return True 2205 else: 2206 stats.count_lines_skipped_http_errors.increment() 2207 return False 2208 return True 2209 2210 def check_http_redirect(self, hit): 2211 if hit.status[0] == '3' and hit.status != '304': 2212 if config.options.enable_http_redirects: 2213 hit.is_redirect = True 2214 return True 2215 else: 2216 stats.count_lines_skipped_http_redirects.increment() 2217 return False 2218 return True 2219 2220 def check_path(self, hit): 2221 for excluded_path in config.options.excluded_paths: 2222 if fnmatch.fnmatch(hit.path, excluded_path): 2223 return False 2224 # By default, all paths are included. 2225 if config.options.included_paths: 2226 for included_path in config.options.included_paths: 2227 if fnmatch.fnmatch(hit.path, included_path): 2228 return True 2229 return False 2230 return True 2231 2232 @staticmethod 2233 def check_format(lineOrFile): 2234 format = False 2235 format_groups = 0 2236 for name, candidate_format in FORMATS.items(): 2237 logging.debug("Check format %s", name) 2238 2239 # skip auto detection for formats that can't be detected automatically 2240 if name == 'ovh': 2241 continue 2242 2243 match = None 2244 try: 2245 if isinstance(lineOrFile, str): 2246 match = candidate_format.check_format_line(lineOrFile) 2247 else: 2248 match = candidate_format.check_format(lineOrFile) 2249 except Exception: 2250 logging.debug('Error in format checking: %s', traceback.format_exc()) 2251 pass 2252 2253 if match: 2254 logging.debug('Format %s matches', name) 2255 2256 # compare format groups if this *BaseFormat has groups() method 2257 try: 2258 # if there's more info in this match, use this format 2259 match_groups = len(match.groups()) 2260 2261 logging.debug('Format match contains %d groups' % match_groups) 2262 2263 if format_groups < match_groups: 2264 format = candidate_format 2265 format_groups = match_groups 2266 except AttributeError: 2267 format = candidate_format 2268 2269 else: 2270 logging.debug('Format %s does not match', name) 2271 2272 # if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the 2273 # --w3c-time-taken-milli option isn't set 2274 if isinstance(format, W3cExtendedFormat): 2275 format.check_for_iis_option() 2276 2277 return format 2278 2279 @staticmethod 2280 def detect_format(file): 2281 """ 2282 Return the best matching format for this file, or None if none was found. 2283 """ 2284 logging.debug('Detecting the log format') 2285 2286 format = False 2287 2288 # check the format using the file (for formats like the W3cExtendedFormat one) 2289 format = Parser.check_format(file) 2290 2291 # check the format using the first N lines (to avoid irregular ones) 2292 lineno = 0 2293 limit = 100000 2294 while not format and lineno < limit: 2295 line = file.readline() 2296 if not line: # if at eof, don't keep looping 2297 break 2298 2299 lineno = lineno + 1 2300 2301 logging.debug("Detecting format against line %i" % lineno) 2302 format = Parser.check_format(line) 2303 2304 try: 2305 file.seek(0) 2306 except IOError: 2307 pass 2308 2309 if not format: 2310 fatal_error("cannot automatically determine the log format using the first %d lines of the log file. " % limit + 2311 "\nMaybe try specifying the format with the --log-format-name command line argument." ) 2312 return 2313 2314 logging.debug('Format %s is the best match', format.name) 2315 return format 2316 2317 def is_filtered(self, hit): 2318 host = None 2319 if hasattr(hit, 'host'): 2320 host = hit.host 2321 else: 2322 try: 2323 host = urllib.parse.urlparse(hit.path).hostname 2324 except: 2325 pass 2326 2327 if host: 2328 if config.options.exclude_host and len(config.options.exclude_host) > 0 and host in config.options.exclude_host: 2329 return (True, 'host matched --exclude-host') 2330 2331 if config.options.include_host and len(config.options.include_host) > 0 and host not in config.options.include_host: 2332 return (True, 'host did not match --include-host') 2333 2334 if config.options.exclude_older_than and hit.date < config.options.exclude_older_than: 2335 return (True, 'date is older than --exclude-older-than') 2336 2337 if config.options.exclude_newer_than and hit.date > config.options.exclude_newer_than: 2338 return (True, 'date is newer than --exclude-newer-than') 2339 2340 return (False, None) 2341 2342 def parse(self, filename): 2343 """ 2344 Parse the specified filename and insert hits in the queue. 2345 """ 2346 def invalid_line(line, reason): 2347 stats.count_lines_invalid.increment() 2348 if config.options.debug >= 2: 2349 logging.debug('Invalid line detected (%s): %s' % (reason, line)) 2350 2351 def filtered_line(line, reason): 2352 stats.count_lines_filtered.increment() 2353 if config.options.debug >= 2: 2354 logging.debug('Filtered line out (%s): %s' % (reason, line)) 2355 2356 if filename == '-': 2357 filename = '(stdin)' 2358 file = sys.stdin 2359 else: 2360 if not os.path.exists(filename): 2361 print("\n=====> Warning: File %s does not exist <=====" % filename, file=sys.stderr) 2362 return 2363 else: 2364 if filename.endswith('.bz2'): 2365 open_func = bz2.open 2366 elif filename.endswith('.gz'): 2367 open_func = gzip.open 2368 else: 2369 open_func = open 2370 2371 file = open_func(filename, mode='rt', encoding=config.options.encoding, errors="surrogateescape") 2372 2373 if config.options.show_progress: 2374 print(('Parsing log %s...' % filename)) 2375 2376 if config.format: 2377 # The format was explicitly specified. 2378 format = config.format 2379 2380 if isinstance(format, W3cExtendedFormat): 2381 format.create_regex(file) 2382 2383 if format.regex is None: 2384 return fatal_error( 2385 "File is not in the correct format, is there a '#Fields:' line? " 2386 "If not, use the --w3c-fields option." 2387 ) 2388 else: 2389 # If the file is empty, don't bother. 2390 data = file.read(100) 2391 if len(data.strip()) == 0: 2392 return 2393 try: 2394 file.seek(0) 2395 except IOError: 2396 pass 2397 2398 format = self.detect_format(file) 2399 if format is None: 2400 return fatal_error( 2401 'Cannot guess the logs format. Please give one using ' 2402 'either the --log-format-name or --log-format-regex option' 2403 ) 2404 # Make sure the format is compatible with the resolver. 2405 resolver.check_format(format) 2406 2407 if config.options.dump_log_regex: 2408 logging.info("Using format '%s'." % format.name) 2409 if format.regex: 2410 logging.info("Regex being used: %s" % format.regex.pattern) 2411 else: 2412 logging.info("Format %s does not use a regex to parse log lines." % format.name) 2413 logging.info("--dump-log-regex option used, aborting log import.") 2414 os._exit(0) 2415 2416 valid_lines_count = 0 2417 2418 hits = [] 2419 lineno = -1 2420 while True: 2421 line = file.readline() 2422 if not line: break 2423 lineno = lineno + 1 2424 2425 stats.count_lines_parsed.increment() 2426 if stats.count_lines_parsed.value <= config.options.skip: 2427 continue 2428 2429 match = format.match(line) 2430 if not match: 2431 invalid_line(line, 'line did not match') 2432 continue 2433 2434 valid_lines_count = valid_lines_count + 1 2435 if config.options.debug_request_limit and valid_lines_count >= config.options.debug_request_limit: 2436 if len(hits) > 0: 2437 Recorder.add_hits(hits) 2438 logging.info("Exceeded limit specified in --debug-request-limit, exiting.") 2439 return 2440 2441 hit = Hit( 2442 filename=filename, 2443 lineno=lineno, 2444 status=format.get('status'), 2445 full_path=format.get('path'), 2446 is_download=False, 2447 is_robot=False, 2448 is_error=False, 2449 is_redirect=False, 2450 args={}, 2451 ) 2452 2453 if config.options.regex_group_to_page_cvars_map: 2454 self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True) 2455 2456 if config.options.regex_group_to_visit_cvars_map: 2457 self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False) 2458 2459 if config.options.regex_groups_to_ignore: 2460 format.remove_ignored_groups(config.options.regex_groups_to_ignore) 2461 2462 # Add http method page cvar 2463 try: 2464 httpmethod = format.get('method') 2465 if config.options.track_http_method and httpmethod != '-': 2466 hit.add_page_custom_var('HTTP-method', httpmethod) 2467 except: 2468 pass 2469 2470 try: 2471 hit.query_string = format.get('query_string') 2472 hit.path = hit.full_path 2473 except BaseFormatException: 2474 hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter) 2475 2476 # W3cExtendedFormat detaults to - when there is no query string, but we want empty string 2477 if hit.query_string == '-': 2478 hit.query_string = '' 2479 2480 hit.extension = hit.path.rsplit('.')[-1].lower() 2481 2482 try: 2483 hit.referrer = format.get('referrer') 2484 2485 if hit.referrer.startswith('"'): 2486 hit.referrer = hit.referrer[1:-1] 2487 except BaseFormatException: 2488 hit.referrer = '' 2489 if hit.referrer == '-': 2490 hit.referrer = '' 2491 2492 try: 2493 hit.user_agent = format.get('user_agent') 2494 2495 # in case a format parser included enclosing quotes, remove them so they are not 2496 # sent to Matomo 2497 if hit.user_agent.startswith('"'): 2498 hit.user_agent = hit.user_agent[1:-1] 2499 except BaseFormatException: 2500 hit.user_agent = '' 2501 2502 hit.ip = format.get('ip') 2503 try: 2504 hit.length = int(format.get('length')) 2505 except (ValueError, BaseFormatException): 2506 # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs) 2507 hit.length = 0 2508 2509 try: 2510 hit.generation_time_milli = float(format.get('generation_time_milli')) 2511 except (ValueError, BaseFormatException): 2512 try: 2513 hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000 2514 except (ValueError, BaseFormatException): 2515 try: 2516 hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000 2517 except (ValueError, BaseFormatException): 2518 hit.generation_time_milli = 0 2519 2520 if config.options.log_hostname: 2521 hit.host = config.options.log_hostname 2522 else: 2523 try: 2524 hit.host = format.get('host').lower().strip('.') 2525 2526 if hit.host.startswith('"'): 2527 hit.host = hit.host[1:-1] 2528 except BaseFormatException: 2529 # Some formats have no host. 2530 pass 2531 2532 # Add userid 2533 try: 2534 hit.userid = None 2535 2536 userid = format.get('userid') 2537 if userid != '-': 2538 hit.args['uid'] = hit.userid = userid 2539 except: 2540 pass 2541 2542 # add event info 2543 try: 2544 hit.event_category = hit.event_action = hit.event_name = None 2545 2546 hit.event_category = format.get('event_category') 2547 hit.event_action = format.get('event_action') 2548 2549 hit.event_name = format.get('event_name') 2550 if hit.event_name == '-': 2551 hit.event_name = None 2552 except: 2553 pass 2554 2555 # Check if the hit must be excluded. 2556 if not all((method(hit) for method in self.check_methods)): 2557 continue 2558 2559 # Parse date. 2560 # We parse it after calling check_methods as it's quite CPU hungry, and 2561 # we want to avoid that cost for excluded hits. 2562 date_string = format.get('date') 2563 try: 2564 hit.date = datetime.datetime.strptime(date_string, format.date_format) 2565 hit.date += datetime.timedelta(seconds = config.options.seconds_to_add_to_date) 2566 except ValueError as e: 2567 invalid_line(line, 'invalid date or invalid format: %s' % str(e)) 2568 continue 2569 2570 # Parse timezone and subtract its value from the date 2571 try: 2572 timezone = format.get('timezone').replace(':', '') 2573 if timezone: 2574 hit.date -= TimeHelper.timedelta_from_timezone(timezone) 2575 except BaseFormatException: 2576 pass 2577 except ValueError: 2578 invalid_line(line, 'invalid timezone') 2579 continue 2580 2581 if config.options.replay_tracking: 2582 # we need a query string and we only consider requests with piwik.php 2583 if not hit.query_string or not self.is_hit_for_tracker(hit): 2584 invalid_line(line, 'no query string, or ' + hit.path.lower() + ' does not end with piwik.php/matomo.php') 2585 continue 2586 2587 query_arguments = urllib.parse.parse_qs(hit.query_string) 2588 if not "idsite" in query_arguments: 2589 invalid_line(line, 'missing idsite') 2590 continue 2591 2592 hit.args.update((k, v.pop()) for k, v in query_arguments.items()) 2593 2594 if config.options.seconds_to_add_to_date: 2595 for param in ['_idts', '_viewts', '_ects', '_refts']: 2596 if param in hit.args: 2597 hit.args[param] = int(hit.args[param]) + config.options.seconds_to_add_to_date 2598 2599 (is_filtered, reason) = self.is_filtered(hit) 2600 if is_filtered: 2601 filtered_line(line, reason) 2602 continue 2603 2604 hits.append(hit) 2605 2606 if len(hits) >= config.options.recorder_max_payload_size * len(Recorder.recorders): 2607 Recorder.add_hits(hits) 2608 hits = [] 2609 2610 # add last chunk of hits 2611 if len(hits) > 0: 2612 Recorder.add_hits(hits) 2613 2614 def is_hit_for_tracker(self, hit): 2615 filesToCheck = ['piwik.php', 'matomo.php'] 2616 if config.options.replay_tracking_expected_tracker_file: 2617 filesToCheck = [config.options.replay_tracking_expected_tracker_file] 2618 2619 lowerPath = hit.path.lower() 2620 for file in filesToCheck: 2621 if lowerPath.endswith(file): 2622 return True 2623 return False 2624 2625 def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var): 2626 for group_name, custom_var_name in groups.items(): 2627 if group_name in format.get_all(): 2628 value = format.get(group_name) 2629 2630 # don't track the '-' empty placeholder value 2631 if value == '-': 2632 continue 2633 2634 if is_page_var: 2635 hit.add_page_custom_var(custom_var_name, value) 2636 else: 2637 hit.add_visit_custom_var(custom_var_name, value) 2638 2639def main(): 2640 """ 2641 Start the importing process. 2642 """ 2643 stats.set_time_start() 2644 2645 if config.options.show_progress: 2646 stats.start_monitor() 2647 2648 recorders = Recorder.launch(config.options.recorders) 2649 2650 try: 2651 for filename in config.filenames: 2652 parser.parse(filename) 2653 2654 Recorder.wait_empty() 2655 except KeyboardInterrupt: 2656 pass 2657 2658 stats.set_time_stop() 2659 2660 if config.options.show_progress: 2661 stats.stop_monitor() 2662 2663 stats.print_summary() 2664 2665def fatal_error(error, filename=None, lineno=None): 2666 print('Fatal error: %s' % error, file=sys.stderr) 2667 if filename and lineno is not None: 2668 print(( 2669 'You can restart the import of "%s" from the point it failed by ' 2670 'specifying --skip=%d on the command line.\n' % (filename, lineno) 2671 ), file=sys.stderr) 2672 os._exit(1) 2673 2674if __name__ == '__main__': 2675 try: 2676 config = Configuration() 2677 # The matomo object depends on the config object, so we have to create 2678 # it after creating the configuration. 2679 matomo = MatomoHttpUrllib() 2680 # The init_token_auth method may need the matomo option, so we must call 2681 # it after creating the matomo object. 2682 config.init_token_auth() 2683 stats = Statistics() 2684 resolver = config.get_resolver() 2685 parser = Parser() 2686 main() 2687 sys.exit(0) 2688 except KeyboardInterrupt: 2689 pass 2690