1#!/usr/local/bin/python
2# vim: et sw=4 ts=4:
3# -*- coding: utf-8 -*-
4#
5# Matomo - free/libre analytics platform
6#
7# @link https://matomo.org
8# @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
9# @version $Id$
10#
11# For more info see: https://matomo.org/log-analytics/ and https://matomo.org/docs/log-analytics-tool-how-to/
12#
# Requires Python 3.5, 3.6, 3.7 or 3.8
14#
15from __future__ import print_function  # this is needed that python2 can run the script until the warning below
16
17import sys
18
19if sys.version_info[0] != 3:
20    print('The log importer currently does not support Python 2 any more.')
21    print('Please use Python 3.5, 3.6, 3.7 or 3.8')
22    sys.exit(1)
23
import argparse
import base64
import bz2
import codecs
import collections
import configparser
import datetime
import fnmatch
import glob
import gzip
import hashlib
import http.client
import inspect
import io
import itertools
import json
import logging
import os
import os.path
import queue
import re
import socket
import ssl
import subprocess
import sys
import textwrap
import threading
import time
import traceback
import urllib.error
import urllib.parse
import urllib.request
56
57# Avoid "got more than 100 headers" error
58http.client._MAXHEADERS = 1000
59
60##
61## Constants.
62##
63
64STATIC_EXTENSIONS = set((
65    'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff woff2 class swf css js xml webp'
66).split())
67
68STATIC_FILES = set((
69    'robots.txt'
70).split())
71
72DOWNLOAD_EXTENSIONS = set((
73    '7z aac arc arj asf asx avi bin csv deb dmg doc docx exe flac flv gz gzip hqx '
74    'ibooks jar json mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp '
75    'ods odt ogg ogv pdf phps ppt pptx qt qtm ra ram rar rpm rtf sea sit tar tbz '
76    'bz2 tbz tgz torrent txt wav webm wma wmv wpd xls xlsx xml xsd z zip '
77    'azw3 epub mobi apk '
78    'md5 sig'
79).split())
80
81# If you want to add more bots, take a look at the Matomo Device Detector botlist:
82# https://github.com/matomo-org/device-detector/blob/master/regexes/bots.yml
83# user agents must be lowercase
84EXCLUDED_USER_AGENTS = (
85    'adsbot-google',
86    'ask jeeves',
87    'baidubot',
88    'bot-',
89    'bot/',
90    'ccooter/',
91    'crawl',
92    'curl',
93    'echoping',
94    'exabot',
95    'feed',
96    'googlebot',
97    'ia_archiver',
98    'java/',
99    'libwww',
100    'mediapartners-google',
101    'msnbot',
102    'netcraftsurvey',
103    'panopta',
104    'pingdom.com_bot_',
105    'robot',
106    'spider',
107    'surveybot',
108    'twiceler',
109    'voilabot',
110    'yahoo',
111    'yandex',
112    'zabbix',
113    'googlestackdrivermonitoring',
114)
115
116MATOMO_DEFAULT_MAX_ATTEMPTS = 3
117MATOMO_DEFAULT_DELAY_AFTER_FAILURE = 10
118DEFAULT_SOCKET_TIMEOUT = 300
119
120MATOMO_EXPECTED_IMAGE = base64.b64decode(
121    'R0lGODlhAQABAIAAAAAAAAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw=='
122)
123
124##
125## Formats.
126##
127
128class BaseFormatException(Exception): pass
129
130class BaseFormat:
131    def __init__(self, name):
132        self.name = name
133        self.regex = None
134        self.date_format = '%d/%b/%Y:%H:%M:%S'
135
136    def check_format(self, file):
137        line = file.readline()
138        try:
139            file.seek(0)
140        except IOError:
141            pass
142
143        return self.check_format_line(line)
144
145    def check_format_line(self, line):
146        return False
147
148class JsonFormat(BaseFormat):
149    def __init__(self, name):
150        super(JsonFormat, self).__init__(name)
151        self.json = None
152        self.date_format = '%Y-%m-%dT%H:%M:%S'
153
154    def check_format_line(self, line):
155        try:
156            self.json = json.loads(line)
157            return True
158        except:
159            return False
160
161    def match(self, line):
162        try:
163            # nginx outputs malformed JSON w/ hex escapes when confronted w/ non-UTF input. we have to
164            # workaround this by converting hex escapes in strings to unicode escapes. the conversion is naive,
165            # so it does not take into account the string's actual encoding (which we don't have access to).
166            line = line.replace('\\x', '\\u00')
167
168            self.json = json.loads(line)
169            return self
170        except:
171            self.json = None
172            return None
173
174    def get(self, key):
175        # Some ugly patchs ...
176        if key == 'generation_time_milli':
177            self.json[key] =  int(float(self.json[key]) * 1000)
178        # Patch date format ISO 8601
179        elif key == 'date':
180            tz = self.json[key][19:]
181            self.json['timezone'] = tz.replace(':', '')
182            self.json[key] = self.json[key][:19]
183
184        try:
185            return self.json[key]
186        except KeyError:
187            raise BaseFormatException()
188
189    def get_all(self,):
190        return self.json
191
192    def remove_ignored_groups(self, groups):
193        for group in groups:
194            del self.json[group]
195
196class RegexFormat(BaseFormat):
197
198    def __init__(self, name, regex, date_format=None):
199        super(RegexFormat, self).__init__(name)
200        if regex is not None:
201            self.regex = re.compile(regex)
202        if date_format is not None:
203            self.date_format = date_format
204        self.matched = None
205
206    def check_format_line(self, line):
207        return self.match(line)
208
209    def match(self,line):
210        if not self.regex:
211            return None
212        match_result = self.regex.match(line)
213        if match_result:
214            self.matched = match_result.groupdict()
215            if 'time' in self.matched:
216                self.matched['date'] = self.matched['date'] + ' ' + self.matched['time']
217                del self.matched['time']
218        else:
219            self.matched = None
220        return match_result
221
222    def get(self, key):
223        try:
224            return self.matched[key]
225        except KeyError:
226            raise BaseFormatException("Cannot find group '%s'." % key)
227
228    def get_all(self,):
229        return self.matched
230
231    def remove_ignored_groups(self, groups):
232        for group in groups:
233            del self.matched[group]
234
235class W3cExtendedFormat(RegexFormat):
236
237    FIELDS_LINE_PREFIX = '#Fields: '
238    REGEX_UNKNOWN_FIELD = r'(?:".*?"|\S+)'
239
240    fields = {
241        'date': r'"?(?P<date>\d+[-\d+]+)"?',
242        'time': r'"?(?P<time>[\d+:]+)[.\d]*?"?',
243        'cs-uri-stem': r'(?P<path>/\S*)',
244        'cs-uri-query': r'(?P<query_string>\S*)',
245        'c-ip': r'"?(?P<ip>[\w*.:-]*)"?',
246        'cs(User-Agent)': r'(?P<user_agent>".*?"|\S*)',
247        'cs(Referer)': r'(?P<referrer>\S+)',
248        'sc-status': r'(?P<status>\d+)',
249        'sc-bytes': r'(?P<length>\S+)',
250        'cs-host': r'(?P<host>\S+)',
251        'cs-method': r'(?P<method>\S+)',
252        'cs-username': r'(?P<userid>\S+)',
253        'time-taken': r'(?P<generation_time_secs>[.\d]+)'
254    }
255
256    def __init__(self):
257        super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S')
258
259    def check_format(self, file):
260        try:
261            file.seek(0)
262        except IOError:
263            pass
264
265        self.create_regex(file)
266
267        # if we couldn't create a regex, this file does not follow the W3C extended log file format
268        if not self.regex:
269            try:
270                file.seek(0)
271            except IOError:
272                pass
273
274            return
275
276        first_line = file.readline()
277
278        try:
279            file.seek(0)
280        except IOError:
281            pass
282
283        return self.check_format_line(first_line)
284
285    def create_regex(self, file):
286        fields_line = None
287        if config.options.w3c_fields:
288            fields_line = config.options.w3c_fields
289
290        # collect all header lines up until the Fields: line
291        # if we're reading from stdin, we can't seek, so don't read any more than the Fields line
292        header_lines = []
293        while fields_line is None:
294            line = file.readline().strip()
295
296            if not line:
297                continue
298
299            if not line.startswith('#'):
300                break
301
302            if line.startswith(self.FIELDS_LINE_PREFIX):
303                fields_line = line
304            else:
305                header_lines.append(line)
306
307        if not fields_line:
308            return
309
310        # store the header lines for a later check for IIS
311        self.header_lines = header_lines
312
313        # Parse the 'Fields: ' line to create the regex to use
314        full_regex = []
315
316        expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping
317
318        # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds
319        if config.options.w3c_time_taken_in_millisecs:
320            expected_fields['time-taken'] = r'(?P<generation_time_milli>[\d.]+)'
321
322        for mapped_field_name, field_name in config.options.custom_w3c_fields.items():
323            expected_fields[mapped_field_name] = expected_fields[field_name]
324            del expected_fields[field_name]
325
326        # add custom field regexes supplied through --w3c-field-regex option
327        for field_name, field_regex in config.options.w3c_field_regexes.items():
328            expected_fields[field_name] = field_regex
329
330        # Skip the 'Fields: ' prefix.
331        fields_line = fields_line[9:].strip()
332        for field in re.split(r'\s+', fields_line):
333            try:
334                regex = expected_fields[field]
335            except KeyError:
336                regex = self.REGEX_UNKNOWN_FIELD
337            full_regex.append(regex)
338        full_regex = r'\s+'.join(full_regex)
339
340        logging.debug("Based on 'Fields:' line, computed regex to be %s", full_regex)
341
342        self.regex = re.compile(full_regex)
343
344    def check_for_iis_option(self):
345        if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis():
346            logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS"
347                         " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned"
348                         " option must be used in order to get accurate generation times.")
349
350    def _is_iis(self):
351        return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0
352
353    def _is_time_taken_milli(self):
354        return 'generation_time_milli' not in self.regex.pattern
355
356class IisFormat(W3cExtendedFormat):
357
358    fields = W3cExtendedFormat.fields.copy()
359    fields.update({
360        'time-taken': r'(?P<generation_time_milli>[.\d]+)',
361        'sc-win32-status': r'(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it
362                                                     # will ensure we always select IIS for the format instead of
363                                                     # W3C logs when detecting the format. This way there will be
364                                                     # less accidental importing of IIS logs w/o --w3c-time-taken-milli.
365    })
366
367    def __init__(self):
368        super(IisFormat, self).__init__()
369
370        self.name = 'iis'
371
372class IncapsulaW3CFormat(W3cExtendedFormat):
373
374    # use custom unknown field regex to make resulting regex much simpler
375    REGEX_UNKNOWN_FIELD = r'".*?"'
376
377    fields = W3cExtendedFormat.fields.copy()
378    # redefines all fields as they are always encapsulated with "
379    fields.update({
380        'cs-uri': r'"(?P<host>[^\/\s]+)(?P<path>\S+)"',
381        'cs-uri-query': r'"(?P<query_string>\S*)"',
382        'c-ip': r'"(?P<ip>[\w*.:-]*)"',
383        'cs(User-Agent)': r'"(?P<user_agent>.*?)"',
384        'cs(Referer)': r'"(?P<referrer>\S+)"',
385        'sc-status': r'(?P<status>"\d*")',
386        'cs-bytes': r'(?P<length>"\d*")',
387    })
388
389    def __init__(self):
390        super(IncapsulaW3CFormat, self).__init__()
391
392        self.name = 'incapsula_w3c'
393
394    def get(self, key):
395        value = super(IncapsulaW3CFormat, self).get(key)
396        if key == 'status' or key == 'length':
397            value = value.strip('"')
398        if key == 'status' and value == '':
399            value = '200'
400        return value
401
402class ShoutcastFormat(W3cExtendedFormat):
403
404    fields = W3cExtendedFormat.fields.copy()
405    fields.update({
406        'c-status': r'(?P<status>\d+)',
407        'x-duration': r'(?P<generation_time_secs>[.\d]+)'
408    })
409
410    def __init__(self):
411        super(ShoutcastFormat, self).__init__()
412
413        self.name = 'shoutcast'
414
415    def get(self, key):
416        if key == 'user_agent':
417            user_agent = super(ShoutcastFormat, self).get(key)
418            return urllib.parse.unquote(user_agent)
419        else:
420            return super(ShoutcastFormat, self).get(key)
421
422class AmazonCloudFrontFormat(W3cExtendedFormat):
423
424    fields = W3cExtendedFormat.fields.copy()
425    fields.update({
426        'x-event': r'(?P<event_action>\S+)',
427        'x-sname': r'(?P<event_name>\S+)',
428        'cs-uri-stem': r'(?:rtmp:/)?(?P<path>/\S*)',
429        'c-user-agent': r'(?P<user_agent>".*?"|\S+)',
430
431        # following are present to match cloudfront instead of W3C when we know it's cloudfront
432        'x-edge-location': r'(?P<x_edge_location>".*?"|\S+)',
433        'x-edge-result-type': r'(?P<x_edge_result_type>".*?"|\S+)',
434        'x-edge-request-id': r'(?P<x_edge_request_id>".*?"|\S+)',
435        'x-host-header': r'(?P<host>".*?"|\S+)'
436    })
437
438    def __init__(self):
439        super(AmazonCloudFrontFormat, self).__init__()
440
441        self.name = 'amazon_cloudfront'
442
443    def get(self, key):
444        if key == 'event_category' and 'event_category' not in self.matched:
445            return 'cloudfront_rtmp'
446        elif key == 'status' and 'status' not in self.matched:
447            return '200'
448        elif key == 'user_agent':
449            user_agent = super(AmazonCloudFrontFormat, self).get(key)
450            return urllib.parse.unquote(urllib.parse.unquote(user_agent))  # Value is double quoted!
451        else:
452            return super(AmazonCloudFrontFormat, self).get(key)
453
454_HOST_PREFIX = r'(?P<host>[\w\-\.]*)(?::\d+)?\s+'
455
456_COMMON_LOG_FORMAT = (
457    r'(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
458    r'"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+(?P<length>\S+)'
459)
460_NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
461    r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
462)
463
464
465_S3_LOG_FORMAT = (
466    r'\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>[\w*.:-]+)\s+'
467    r'(?P<userid>\S+)\s+\S+\s+\S+\s+\S+\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+\S+\s+(?P<length>\S+)\s+'
468    r'\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
469)
470_ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT +
471    r'\s+(?P<session_time>[0-9-]+)'
472)
473_ELB_LOG_FORMAT = (
474    r'(?:\S+\s+)?(?P<date>[0-9-]+T[0-9:]+)\.\S+\s+\S+\s+(?P<ip>[\w*.:-]+):\d+\s+\S+:\d+\s+\S+\s+(?P<generation_time_secs>\S+)\s+\S+\s+'
475    r'(?P<status>\d+)\s+\S+\s+\S+\s+(?P<length>\S+)\s+'
476    r'"\S+\s+\w+:\/\/(?P<host>[\w\-\.]*):\d+(?P<path>\/\S*)\s+[^"]+"\s+"(?P<user_agent>[^"]+)"\s+\S+\s+\S+'
477)
478
479_OVH_FORMAT = (
480    r'(?P<ip>\S+)\s+' + _HOST_PREFIX + r'(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
481    r'"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
482    r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
483)
484
485_HAPROXY_FORMAT = (
486    r'.*:\ (?P<ip>[\w*.]+).*\[(?P<date>.*)\].*\ (?P<status>\b\d{3}\b)\ (?P<length>\d+)\ -.*\"(?P<method>\S+)\ (?P<path>\S+).*'
487)
488
489_GANDI_SIMPLE_HOSTING_FORMAT = (
490    r'(?P<host>[0-9a-zA-Z-_.]+)\s+(?P<ip>[a-zA-Z0-9.]+)\s+\S+\s+(?P<userid>\S+)\s+\[(?P<date>.+?)\s+(?P<timezone>.+?)\]\s+\((?P<generation_time_secs>[0-9a-zA-Z\s]*)\)\s+"(?P<method>[A-Z]+)\s+(?P<path>\S+)\s+(\S+)"\s+(?P<status>[0-9]+)\s+(?P<length>\S+)\s+"(?P<referrer>\S+)"\s+"(?P<user_agent>[^"]+)"'
491)
492
493FORMATS = {
494    'common': RegexFormat('common', _COMMON_LOG_FORMAT),
495    'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT),
496    'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT),
497    'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT),
498    'w3c_extended': W3cExtendedFormat(),
499    'amazon_cloudfront': AmazonCloudFrontFormat(),
500    'incapsula_w3c': IncapsulaW3CFormat(),
501    'iis': IisFormat(),
502    'shoutcast': ShoutcastFormat(),
503    's3': RegexFormat('s3', _S3_LOG_FORMAT),
504    'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
505    'elb': RegexFormat('elb', _ELB_LOG_FORMAT, '%Y-%m-%dT%H:%M:%S'),
506    'nginx_json': JsonFormat('nginx_json'),
507    'ovh': RegexFormat('ovh', _OVH_FORMAT),
508    'haproxy': RegexFormat('haproxy', _HAPROXY_FORMAT, '%d/%b/%Y:%H:%M:%S.%f'),
509    'gandi': RegexFormat('gandi', _GANDI_SIMPLE_HOSTING_FORMAT, '%d/%b/%Y:%H:%M:%S')
510}
511
512##
513## Code.
514##
515
516class StoreDictKeyPair(argparse.Action):
517    def __call__(self, parser, namespace, values, option_string=None):
518        my_dict = getattr(namespace, self.dest, None)
519        if not my_dict:
520            my_dict = {}
521        for kv in values.split(","):
522            k,v = kv.split("=")
523            my_dict[k] = v
524        setattr(namespace, self.dest, my_dict)
525
526class Configuration:
527    """
528    Stores all the configuration options by reading sys.argv and parsing,
529    if needed, the config.inc.php.
530
531    It has 2 attributes: options and filenames.
532    """
533
534    class Error(Exception):
535        pass
536
537    def _create_parser(self):
538        """
539        Initialize and return the OptionParser instance.
540        """
541        parser = argparse.ArgumentParser(
542            # usage='Usage: %prog [options] log_file [ log_file [...] ]',
543            description="Import HTTP access logs to Matomo. "
544                         "log_file is the path to a server access log file (uncompressed, .gz, .bz2, or specify - to read from stdin). "
545                         " You may also import many log files at once (for example set log_file to *.log or *.log.gz)."
546                         " By default, the script will try to produce clean reports and will exclude bots, static files, discard http error and redirects, etc. This is customizable, see below.",
547            epilog="About Matomo Server Log Analytics: https://matomo.org/log-analytics/ "
548                   "              Found a bug? Please create a ticket in https://github.com/matomo-org/matomo-log-analytics/ "
549                   "              Please send your suggestions or successful user story to hello@matomo.org "
550        )
551
552        parser.add_argument('file', type=str, nargs='+')
553
554        # Basic auth user
555        parser.add_argument(
556            '--auth-user', dest='auth_user',
557            help="Basic auth user",
558        )
559        # Basic auth password
560        parser.add_argument(
561            '--auth-password', dest='auth_password',
562            help="Basic auth password",
563        )
564        parser.add_argument(
565            '--debug', '-d', dest='debug', action='count', default=0,
566            help="Enable debug output (specify multiple times for more verbose)",
567        )
568        parser.add_argument(
569            '--debug-tracker', dest='debug_tracker', action='store_true', default=False,
570            help="Appends &debug=1 to tracker requests and prints out the result so the tracker can be debugged. If "
571            "using the log importer results in errors with the tracker or improperly recorded visits, this option can "
572            "be used to find out what the tracker is doing wrong. To see debug tracker output, you must also set the "
573            "[Tracker] debug_on_demand INI config to 1 in your Matomo's config.ini.php file."
574        )
575        parser.add_argument(
576            '--debug-request-limit', dest='debug_request_limit', type=int, default=None,
577            help="Debug option that will exit after N requests are parsed. Can be used w/ --debug-tracker to limit the "
578            "output of a large log file."
579        )
580        parser.add_argument(
581            '--url', dest='matomo_url', required=True,
582            help="REQUIRED Your Matomo server URL, eg. https://example.com/matomo/ or https://analytics.example.net",
583        )
584        parser.add_argument(
585            '--api-url', dest='matomo_api_url',
586            help="This URL will be used to send API requests (use it if your tracker URL differs from UI/API url), "
587            "eg. https://other-example.com/matomo/ or https://analytics-api.example.net",
588        )
589        parser.add_argument(
590            '--tracker-endpoint-path', dest='matomo_tracker_endpoint_path', default='/piwik.php',
591            help="The tracker endpoint path to use when tracking. Defaults to /piwik.php."
592        )
593        parser.add_argument(
594            '--dry-run', dest='dry_run',
595            action='store_true', default=False,
596            help="Perform a trial run with no tracking data being inserted into Matomo",
597        )
598        parser.add_argument(
599            '--show-progress', dest='show_progress',
600            action='store_true', default=hasattr(sys.stdout, 'fileno') and os.isatty(sys.stdout.fileno()),
601            help="Print a progress report X seconds (default: 1, use --show-progress-delay to override)"
602        )
603        parser.add_argument(
604            '--show-progress-delay', dest='show_progress_delay',
605            type=int, default=1,
606            help="Change the default progress delay"
607        )
608        parser.add_argument(
609            '--add-sites-new-hosts', dest='add_sites_new_hosts',
610            action='store_true', default=False,
611            help="When a hostname is found in the log file, but not matched to any website "
612            "in Matomo, automatically create a new website in Matomo with this hostname to "
613            "import the logs"
614        )
615        parser.add_argument(
616            '--idsite', dest='site_id',
617            help= ("When specified, "
618                   "data in the specified log files will be tracked for this Matomo site ID."
619                   " The script will not auto-detect the website based on the log line hostname (new websites will not be automatically created).")
620        )
621        parser.add_argument(
622            '--idsite-fallback', dest='site_id_fallback',
623            help="Default Matomo site ID to use if the hostname doesn't match any "
624            "known Website's URL. New websites will not be automatically created. "
625            "                         Used only if --add-sites-new-hosts or --idsite are not set",
626        )
627        default_config = os.path.abspath(
628            os.path.join(os.path.dirname(__file__),
629            '../../config/config.ini.php'),
630        )
631        parser.add_argument(
632            '--config', dest='config_file', default=default_config,
633            help=(
634                "This is only used when --login and --password is not used. "
635                "Matomo will read the configuration file (default: %(default)s) to "
636                "fetch the Super User token_auth from the config file. "
637            )
638        )
639        parser.add_argument(
640            '--login', dest='login',
641            help="You can manually specify the Matomo Super User login"
642        )
643        parser.add_argument(
644            '--password', dest='password',
645            help="You can manually specify the Matomo Super User password"
646        )
647        parser.add_argument(
648            '--token-auth', dest='matomo_token_auth',
649            help="Matomo user token_auth, the token_auth is found in Matomo > Settings > API. "
650                 "You must use a token_auth that has at least 'admin' or 'super user' permission. "
651                 "If you use a token_auth for a non admin user, your users' IP addresses will not be tracked properly. "
652        )
653
654        parser.add_argument(
655            '--hostname', dest='hostnames', action='append', default=[],
656            help="Accepted hostname (requests with other hostnames will be excluded). "
657            " You may use the star character * "
658            " Example: --hostname=*domain.com"
659            " Can be specified multiple times"
660        )
661        parser.add_argument(
662            '--exclude-path', dest='excluded_paths', action='append', default=[],
663            help="Any URL path matching this exclude-path will not be imported in Matomo. "
664            " You must use the star character *. "
665            " Example: --exclude-path=*/admin/*"
666            " Can be specified multiple times. "
667        )
668        parser.add_argument(
669            '--exclude-path-from', dest='exclude_path_from',
670            help="Each line from this file is a path to exclude. Each path must contain the character * to match a string. (see: --exclude-path)"
671        )
672        parser.add_argument(
673            '--include-path', dest='included_paths', action='append', default=[],
674            help="Paths to include. Can be specified multiple times. If not specified, all paths are included."
675        )
676        parser.add_argument(
677            '--include-path-from', dest='include_path_from',
678            help="Each line from this file is a path to include"
679        )
680        parser.add_argument(
681            '--useragent-exclude', dest='excluded_useragents',
682            action='append', default=[],
683            help="User agents to exclude (in addition to the standard excluded "
684            "user agents). Can be specified multiple times",
685        )
686        parser.add_argument(
687            '--enable-static', dest='enable_static',
688            action='store_true', default=False,
689            help="Track static files (images, css, js, ico, ttf, etc.)"
690        )
691        parser.add_argument(
692            '--enable-bots', dest='enable_bots',
693            action='store_true', default=False,
694            help="Track bots. All bot visits will have a Custom Variable set with name='Bot' and value='$Bot_user_agent_here$'"
695        )
696        parser.add_argument(
697            '--enable-http-errors', dest='enable_http_errors',
698            action='store_true', default=False,
699            help="Track HTTP errors (status code 4xx or 5xx)"
700        )
701        parser.add_argument(
702            '--enable-http-redirects', dest='enable_http_redirects',
703            action='store_true', default=False,
704            help="Track HTTP redirects (status code 3xx except 304)"
705        )
706        parser.add_argument(
707            '--enable-reverse-dns', dest='reverse_dns',
708            action='store_true', default=False,
709            help="Enable reverse DNS, used to generate the 'Providers' report in Matomo. "
710                 "Disabled by default, as it impacts performance"
711        )
712        parser.add_argument(
713            '--strip-query-string', dest='strip_query_string',
714            action='store_true', default=False,
715            help="Strip the query string from the URL"
716        )
717        parser.add_argument(
718            '--query-string-delimiter', dest='query_string_delimiter', default='?',
719            help="The query string delimiter (default: %(default)s)"
720        )
721        parser.add_argument(
722            '--log-format-name', dest='log_format_name', default=None,
723            help=("Access log format to detect (supported are: %s). "
724                  "When not specified, the log format will be autodetected by trying all supported log formats."
725                  % ', '.join(sorted(FORMATS.keys())))
726        )
727        available_regex_groups = ['date', 'path', 'query_string', 'ip', 'user_agent', 'referrer', 'status',
728                                  'length', 'host', 'userid', 'generation_time_milli', 'event_action',
729                                  'event_name', 'timezone', 'session_time']
730        parser.add_argument(
731            '--log-format-regex', dest='log_format_regex', default=None,
732            help="Regular expression used to parse log entries. Regexes must contain named groups for different log fields. "
733                 "Recognized fields include: %s. For an example of a supported Regex, see the source code of this file. "
734                 "Overrides --log-format-name." % (', '.join(available_regex_groups))
735        )
736        parser.add_argument(
737            '--log-date-format', dest='log_date_format', default=None,
738            help="Format string used to parse dates. You can specify any format that can also be specified to "
739                 "the strptime python function."
740        )
741        parser.add_argument(
742            '--log-hostname', dest='log_hostname', default=None,
743            help="Force this hostname for a log format that doesn't include it. All hits "
744            "will seem to come to this host"
745        )
746        parser.add_argument(
747            '--skip', dest='skip', default=0, type=int,
748            help="Skip the n first lines to start parsing/importing data at a given line for the specified log file",
749        )
750        parser.add_argument(
751            '--recorders', dest='recorders', default=1, type=int,
752            help="Number of simultaneous recorders (default: %(default)s). "
753            "It should be set to the number of CPU cores in your server. "
754            "You can also experiment with higher values which may increase performance until a certain point",
755        )
756        parser.add_argument(
757            '--recorder-max-payload-size', dest='recorder_max_payload_size', default=200, type=int,
758            help="Maximum number of log entries to record in one tracking request (default: %(default)s). "
759        )
760        parser.add_argument(
761            '--replay-tracking', dest='replay_tracking',
762            action='store_true', default=False,
763            help="Replay piwik.php requests found in custom logs (only piwik.php requests expected). \nSee https://matomo.org/faq/how-to/faq_17033/"
764        )
765        parser.add_argument(
766            '--replay-tracking-expected-tracker-file', dest='replay_tracking_expected_tracker_file', default=None,
767            help="The expected suffix for tracking request paths. Only logs whose paths end with this will be imported. By default "
768            "requests to the piwik.php file or the matomo.php file will be imported."
769        )
770        parser.add_argument(
771            '--output', dest='output',
772            help="Redirect output (stdout and stderr) to the specified file"
773        )
774        parser.add_argument(
775            '--encoding', dest='encoding', default='utf8',
776            help="Log files encoding (default: %(default)s)"
777        )
778        parser.add_argument(
779            '--disable-bulk-tracking', dest='use_bulk_tracking',
780            default=True, action='store_false',
781            help="Disables use of bulk tracking so recorders record one hit at a time."
782        )
783        parser.add_argument(
784            '--debug-force-one-hit-every-Ns', dest='force_one_action_interval', default=False, type=float,
785            help="Debug option that will force each recorder to record one hit every N secs."
786        )
787        parser.add_argument(
788            '--force-lowercase-path', dest='force_lowercase_path', default=False, action='store_true',
789            help="Make URL path lowercase so paths with the same letters but different cases are "
790                 "treated the same."
791        )
792        parser.add_argument(
793            '--enable-testmode', dest='enable_testmode', default=False, action='store_true',
794            help="If set, it will try to get the token_auth from the matomo_tests directory"
795        )
796        parser.add_argument(
797            '--download-extensions', dest='download_extensions', default=None,
798            help="By default Matomo tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped."
799        )
800        parser.add_argument(
801            '--add-download-extensions', dest='extra_download_extensions', default=None,
802            help="Add extensions that should be treated as downloads. See --download-extensions for more info."
803        )
804        parser.add_argument(
805            '--w3c-map-field', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="custom_w3c_fields",
806            help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log "
807                 "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used "
808                 "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n"
809                 "Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more "
810                 "fields that can be mapped."
811                     % (', '.join(list(W3cExtendedFormat.fields.keys())))
812        )
813        parser.add_argument(
814            '--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs',
815            help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing"
816                 " IIS logs."
817        )
818        parser.add_argument(
819            '--w3c-fields', dest='w3c_fields', default=None,
820            help="Specify the '#Fields:' line for a log file in the W3C Extended log file format. Use this option if "
821                 "your log file doesn't contain the '#Fields:' line which is required for parsing. This option must be used "
822                 "in conjunction with --log-format-name=w3c_extended.\n"
823                 "Example: --w3c-fields='#Fields: date time c-ip ...'"
824        )
825        parser.add_argument(
826            '--w3c-field-regex', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="w3c_field_regexes", type=str,
827            help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the "
828                 "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track "
829                 "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
830                 "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
831                 "in the 'Windows Status Code' custom variable. Regexes must contain a named group."
832        )
833        parser.add_argument(
834            '--title-category-delimiter', dest='title_category_delimiter', default='/',
835            help="If --enable-http-errors is used, errors are shown in the page titles report. If you have "
836            "changed General.action_title_category_delimiter in your Matomo configuration, you need to set this "
837            "option to the same value in order to get a pretty page titles report."
838        )
839        parser.add_argument(
840            '--dump-log-regex', dest='dump_log_regex', action='store_true', default=False,
841            help="Prints out the regex string used to parse log lines and exists. Can be useful for using formats "
842                 "in newer versions of the script in older versions of the script. The output regex can be used with "
843                 "the --log-format-regex option."
844        )
845
846        parser.add_argument(
847            '--ignore-groups', dest='regex_groups_to_ignore', default=None,
848            help="Comma separated list of regex groups to ignore when parsing log lines. Can be used to, for example, "
849                 "disable normal user id tracking. See documentation for --log-format-regex for list of available "
850                 "regex groups."
851        )
852
853        parser.add_argument(
854            '--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL',dest='regex_group_to_visit_cvars_map', default={},
855            help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal "
856                 "approach. For example, to track usernames as a custom variable instead of through the uid tracking "
857                 "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a "
858                 "custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
859                 "for --log-format-regex (additional regex groups you may have defined "
860                 "in --log-format-regex can also be used)."
861        )
862        parser.add_argument(
863            '--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={},
864            help="Track an attribute through a custom variable with page scope instead of through Matomo's normal "
865                 "approach. For example, to track usernames as a custom variable instead of through the uid tracking "
866                 "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a "
867                 "custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
868                 "for --log-format-regex (additional regex groups you may have defined "
869                 "in --log-format-regex can also be used)."
870        )
871        parser.add_argument(
872            '--track-http-method', dest='track_http_method', default=False,
873            help="Enables tracking of http method as custom page variable if method group is available in log format."
874        )
875        parser.add_argument(
876            '--retry-max-attempts', dest='max_attempts', default=MATOMO_DEFAULT_MAX_ATTEMPTS, type=int,
877            help="The maximum number of times to retry a failed tracking request."
878        )
879        parser.add_argument(
880            '--retry-delay', dest='delay_after_failure', default=MATOMO_DEFAULT_DELAY_AFTER_FAILURE, type=int,
881            help="The number of seconds to wait before retrying a failed tracking request."
882        )
883        parser.add_argument(
884            '--request-timeout', dest='request_timeout', default=DEFAULT_SOCKET_TIMEOUT, type=int,
885            help="The maximum number of seconds to wait before terminating an HTTP request to Matomo."
886        )
887        parser.add_argument(
888            '--include-host', action='append', type=str,
889            help="Only import logs from the specified host(s)."
890        )
891        parser.add_argument(
892            '--exclude-host', action='append', type=str,
893            help="Only import logs that are not from the specified host(s)."
894        )
895        parser.add_argument(
896            '--exclude-older-than', type=self._valid_date, default=None,
897            help="Ignore logs older than the specified date. Exclusive. Date format must be YYYY-MM-DD hh:mm:ss +/-0000. The timezone offset is required."
898        )
899        parser.add_argument(
900            '--exclude-newer-than', type=self._valid_date, default=None,
901            help="Ignore logs newer than the specified date. Exclusive. Date format must be YYYY-MM-DD hh:mm:ss +/-0000. The timezone offset is required."
902        )
903        parser.add_argument(
904            '--add-to-date', dest='seconds_to_add_to_date', default=0, type=int,
905            help="A number of seconds to add to each date value in the log file."
906        )
907        parser.add_argument(
908            '--request-suffix', dest='request_suffix', default=None, type=str, help="Extra parameters to append to tracker and API requests."
909        )
910        parser.add_argument(
911            '--accept-invalid-ssl-certificate',
912            dest='accept_invalid_ssl_certificate', action='store_true',
913            default=False,
914            help="Do not verify the SSL / TLS certificate when contacting the Matomo server."
915        )
916        parser.add_argument(
917            '--php-binary', dest='php_binary', type=str, default='php',
918            help="Specify the PHP binary to use.",
919        )
920        return parser
921
922    def _valid_date(self, value):
923        try:
924            (date_str, timezone) = value.rsplit(' ', 1)
925        except:
926            raise argparse.ArgumentTypeError("Invalid date value '%s'." % value)
927
928        if not re.match('[-+][0-9]{4}', timezone):
929            raise argparse.ArgumentTypeError("Invalid date value '%s': expected valid timzeone like +0100 or -1200, got '%s'" % (value, timezone))
930
931        date = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
932        date -= TimeHelper.timedelta_from_timezone(timezone)
933
934        return date
935
936    def _parse_args(self, option_parser, argv = None):
937        """
938        Parse the command line args and create self.options and self.filenames.
939        """
940        if not argv:
941            argv = sys.argv[1:]
942
943        self.options = option_parser.parse_args(argv)
944        self.filenames = self.options.file
945
946        if self.options.output:
947            sys.stdout = sys.stderr = open(self.options.output, 'a')
948
949        all_filenames = []
950        for self.filename in self.filenames:
951            if self.filename == '-':
952                all_filenames.append(self.filename)
953            else:
954                all_filenames = all_filenames + sorted(glob.glob(self.filename))
955        self.filenames = all_filenames
956
957        # Configure logging before calling logging.{debug,info}.
958        logging.basicConfig(
959            format='%(asctime)s: [%(levelname)s] %(message)s',
960            level=logging.DEBUG if self.options.debug >= 1 else logging.INFO,
961        )
962
963        self.options.excluded_useragents = set([s.lower() for s in self.options.excluded_useragents])
964
965        if self.options.exclude_path_from:
966            paths = [path.strip() for path in open(self.options.exclude_path_from).readlines()]
967            self.options.excluded_paths.extend(path for path in paths if len(path) > 0)
968        if self.options.excluded_paths:
969            self.options.excluded_paths = set(self.options.excluded_paths)
970            logging.debug('Excluded paths: %s', ' '.join(self.options.excluded_paths))
971
972        if self.options.include_path_from:
973            paths = [path.strip() for path in open(self.options.include_path_from).readlines()]
974            self.options.included_paths.extend(path for path in paths if len(path) > 0)
975        if self.options.included_paths:
976            self.options.included_paths = set(self.options.included_paths)
977            logging.debug('Included paths: %s', ' '.join(self.options.included_paths))
978
979        if self.options.hostnames:
980            logging.debug('Accepted hostnames: %s', ', '.join(self.options.hostnames))
981        else:
982            logging.debug('Accepted hostnames: all')
983
984        if self.options.log_format_regex:
985            self.format = RegexFormat('custom', self.options.log_format_regex, self.options.log_date_format)
986        elif self.options.log_format_name:
987            try:
988                self.format = FORMATS[self.options.log_format_name]
989            except KeyError:
990                fatal_error('invalid log format: %s' % self.options.log_format_name)
991        else:
992            self.format = None
993
994        if not hasattr(self.options, 'custom_w3c_fields'):
995            self.options.custom_w3c_fields = {}
996        elif self.format is not None:
997            # validate custom field mappings
998            for dummy_custom_name, default_name in self.options.custom_w3c_fields.items():
999                if default_name not in type(format).fields:
1000                    fatal_error("custom W3C field mapping error: don't know how to parse and use the '%s' field" % default_name)
1001                    return
1002
1003        if hasattr(self.options, 'w3c_field_regexes'):
1004            # make sure each custom w3c field regex has a named group
1005            for field_name, field_regex in self.options.w3c_field_regexes.items():
1006                if '(?P<' not in field_regex:
1007                    fatal_error("cannot find named group in custom w3c field regex '%s' for field '%s'" % (field_regex, field_name))
1008                    return
1009
1010
1011        if not (self.options.matomo_url.startswith('http://') or self.options.matomo_url.startswith('https://')):
1012            self.options.matomo_url = 'http://' + self.options.matomo_url
1013        logging.debug('Matomo Tracker API URL is: %s', self.options.matomo_url)
1014
1015        if not self.options.matomo_api_url:
1016            self.options.matomo_api_url = self.options.matomo_url
1017
1018        if not (self.options.matomo_api_url.startswith('http://') or self.options.matomo_api_url.startswith('https://')):
1019            self.options.matomo_api_url = 'http://' + self.options.matomo_api_url
1020        logging.debug('Matomo Analytics API URL is: %s', self.options.matomo_api_url)
1021
1022        if self.options.recorders < 1:
1023            self.options.recorders = 1
1024
1025        download_extensions = DOWNLOAD_EXTENSIONS
1026        if self.options.download_extensions:
1027            download_extensions = set(self.options.download_extensions.split(','))
1028
1029        if self.options.extra_download_extensions:
1030            download_extensions.update(self.options.extra_download_extensions.split(','))
1031        self.options.download_extensions = download_extensions
1032
1033        if self.options.regex_groups_to_ignore:
1034            self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(','))
1035
    def __init__(self, argv = None):
        """Build the argument parser and parse argv (defaults to sys.argv[1:])."""
        self._parse_args(self._create_parser(), argv)
1038
1039    def _get_token_auth(self):
1040        """
1041        If the token auth is not specified in the options, get it from Matomo.
1042        """
1043        # Get superuser login/password from the options.
1044        logging.debug('No token-auth specified')
1045
1046        if self.options.login and self.options.password:
1047            matomo_login = self.options.login
1048            matomo_password = self.options.password
1049
1050            logging.debug('Using credentials: (login = %s, using password = %s)', matomo_login, 'YES' if matomo_password else 'NO')
1051            try:
1052                api_result = matomo.call_api('UsersManager.createAppSpecificTokenAuth',
1053                    userLogin=matomo_login,
1054                    passwordConfirmation=matomo_password,
1055                    description='Log importer',
1056                    expireHours='48',
1057                    _token_auth='',
1058                    _url=self.options.matomo_api_url,
1059                )
1060            except urllib.error.URLError as e:
1061                fatal_error('error when fetching token_auth from the API: %s' % e)
1062
1063            try:
1064                return api_result['value']
1065            except KeyError:
1066                # Happens when the credentials are invalid.
1067                message = api_result.get('message')
1068                fatal_error(
1069                    'error fetching authentication token token_auth%s' % (
1070                    ': %s' % message if message else '')
1071                )
1072        else:
1073            # Fallback to the given (or default) configuration file, then
1074            # get the token from the API.
1075            logging.debug(
1076                'No credentials specified, reading them from "%s"',
1077                self.options.config_file,
1078            )
1079            config_file = configparser.RawConfigParser(strict=False)
1080            success = len(config_file.read(self.options.config_file)) > 0
1081            if not success:
1082                fatal_error(
1083                    "the configuration file" + self.options.config_file + " could not be read. Please check permission. This file must be readable by the user running this script to get the authentication token"
1084                )
1085
1086            updatetokenfile = os.path.abspath(
1087                os.path.join(self.options.config_file,
1088                    '../../misc/cron/updatetoken.php'),
1089            )
1090
1091            phpBinary = config.options.php_binary
1092
1093            # Special handling for windows (only if given php binary does not differ from default)
1094            is_windows = sys.platform.startswith('win')
1095            if phpBinary == 'php' and is_windows:
1096                try:
1097                    processWin = subprocess.Popen('where php.exe', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1098                    [stdout, stderr] = processWin.communicate()
1099                    if processWin.returncode == 0:
1100                        phpBinary = stdout.strip()
1101                    else:
1102                        fatal_error("We couldn't detect PHP. It might help to add your php.exe to the path or alternatively run the importer using the --login and --password option")
1103                except:
1104                    fatal_error("We couldn't detect PHP. You can run the importer using the --login and --password option to fix this issue")
1105
1106            command = [phpBinary, updatetokenfile]
1107            if self.options.enable_testmode:
1108                command.append('--testmode')
1109
1110            hostname = urllib.parse.urlparse( self.options.matomo_url ).hostname
1111            command.append('--matomo-domain=' + hostname )
1112
1113            command = subprocess.list2cmdline(command)
1114
1115#            logging.debug(command);
1116
1117            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
1118            [stdout, stderr] = process.communicate()
1119            stdout, stderr = stdout.decode(), stderr.decode()
1120            if process.returncode != 0:
1121                fatal_error("`" + command + "` failed with error: " + stderr + ".\nReponse code was: " + str(process.returncode) + ". You can alternatively run the importer using the --login and --password option")
1122
1123            filename = stdout
1124            credentials = open(filename, 'r').readline()
1125            credentials = credentials.split('\t')
1126            return credentials[1]
1127
1128    def get_resolver(self):
1129        if self.options.site_id:
1130            logging.debug('Resolver: static')
1131            return StaticResolver(self.options.site_id)
1132        else:
1133            logging.debug('Resolver: dynamic')
1134            return DynamicResolver()
1135
1136    def init_token_auth(self):
1137        if not self.options.matomo_token_auth:
1138            try:
1139                self.options.matomo_token_auth = self._get_token_auth()
1140            except MatomoHttpBase.Error as e:
1141                fatal_error(e)
1142        logging.debug('Authentication token token_auth is: %s', self.options.matomo_token_auth)
1143
1144
class Statistics:
    """
    Store statistics about parsed logs and recorded entries.
    Can optionally print statistics on standard output every second.
    """

    class Counter:
        """
        Simple integers cannot be used by multithreaded programs. See:
        https://stackoverflow.com/questions/6320107/are-python-ints-thread-safe
        """
        def __init__(self):
            # itertools.count's implementation in C does not release the GIL and
            # therefore is thread-safe.
            self.counter = itertools.count(1)
            self.value = 0

        def increment(self):
            """Atomically advance the counter by one and cache the new value."""
            self.value = next(self.counter)

        def advance(self, n):
            """Advance the counter by n, one atomic increment at a time."""
            for i in range(n):
                self.increment()

        def __str__(self):
            return str(int(self.value))

    def __init__(self):
        # Wall-clock timestamps set by set_time_start()/set_time_stop().
        self.time_start = None
        self.time_stop = None

        self.matomo_sites = set()                # sites ID
        self.matomo_sites_created = []           # (hostname, site ID)
        self.matomo_sites_ignored = set()        # hostname

        self.count_lines_parsed = self.Counter()
        self.count_lines_recorded = self.Counter()

        # requests that the Matomo tracker considered invalid (or failed to track)
        self.invalid_lines = []

        # Do not match the regexp.
        self.count_lines_invalid = self.Counter()
        # Were filtered out.
        self.count_lines_filtered = self.Counter()
        # No site ID found by the resolver.
        self.count_lines_no_site = self.Counter()
        # Hostname filtered by config.options.hostnames
        self.count_lines_hostname_skipped = self.Counter()
        # Static files.
        self.count_lines_static = self.Counter()
        # Ignored user-agents.
        self.count_lines_skipped_user_agent = self.Counter()
        # Ignored HTTP errors.
        self.count_lines_skipped_http_errors = self.Counter()
        # Ignored HTTP redirects.
        self.count_lines_skipped_http_redirects = self.Counter()
        # Downloads
        self.count_lines_downloads = self.Counter()
        # Ignored downloads when --download-extensions is used
        self.count_lines_skipped_downloads = self.Counter()

        # Misc
        self.dates_recorded = set()
        # Flag polled by the monitor thread; set by stop_monitor().
        self.monitor_stop = False

    def set_time_start(self):
        """Record the import start time (seconds since the epoch)."""
        self.time_start = time.time()

    def set_time_stop(self):
        """Record the import stop time (seconds since the epoch)."""
        self.time_stop = time.time()

    def _compute_speed(self, value, start, end):
        """Return value per second between start and end; the string
        'very high!' when no measurable time elapsed, 0 when value is 0."""
        delta_time = end - start
        if value == 0:
            return 0
        if delta_time == 0:
            return 'very high!'
        else:
            return value / delta_time

    def _round_value(self, value, base=100):
        """Round value to a precision of 1/base (base=100 -> 2 decimals)."""
        return round(value * base) / base

    def _indent_text(self, lines, level=1):
        """
        Return an indented text. 'lines' can be a list of lines or a single
        line (as a string). One level of indentation is 4 spaces.
        """
        prefix = ' ' * (4 * level)
        if isinstance(lines, str):
            return prefix + lines
        else:
            return '\n'.join(
                prefix + line
                for line in lines
            )

    def print_summary(self):
        """Print the final human-readable import summary to stdout.

        NOTE(review): reads the module-level `config` object for the API URL;
        must only be called after the global configuration is initialized.
        """
        invalid_lines_summary = ''
        if self.invalid_lines:
            invalid_lines_summary = '''Invalid log lines
-----------------

The following lines were not tracked by Matomo, either due to a malformed tracker request or error in the tracker:

%s

''' % textwrap.fill(", ".join(self.invalid_lines), 80)

        print(('''
%(invalid_lines)sLogs import summary
-------------------

    %(count_lines_recorded)d requests imported successfully
    %(count_lines_downloads)d requests were downloads
    %(total_lines_ignored)d requests ignored:
        %(count_lines_skipped_http_errors)d HTTP errors
        %(count_lines_skipped_http_redirects)d HTTP redirects
        %(count_lines_invalid)d invalid log lines
        %(count_lines_filtered)d filtered log lines
        %(count_lines_no_site)d requests did not match any known site
        %(count_lines_hostname_skipped)d requests did not match any --hostname
        %(count_lines_skipped_user_agent)d requests done by bots, search engines...
        %(count_lines_static)d requests to static resources (css, js, images, ico, ttf...)
        %(count_lines_skipped_downloads)d requests to file downloads did not match any --download-extensions

Website import summary
----------------------

    %(count_lines_recorded)d requests imported to %(total_sites)d sites
        %(total_sites_existing)d sites already existed
        %(total_sites_created)d sites were created:
%(sites_created)s
    %(total_sites_ignored)d distinct hostnames did not match any existing site:
%(sites_ignored)s
%(sites_ignored_tips)s

Performance summary
-------------------

    Total time: %(total_time)d seconds
    Requests imported per second: %(speed_recording)s requests per second

Processing your log data
------------------------

    In order for your logs to be processed by Matomo, you may need to run the following command:
     ./console core:archive --force-all-websites --url='%(url)s'
''' % {

    'count_lines_recorded': self.count_lines_recorded.value,
    'count_lines_downloads': self.count_lines_downloads.value,
    'total_lines_ignored': sum([
            self.count_lines_invalid.value,
            self.count_lines_filtered.value,
            self.count_lines_skipped_user_agent.value,
            self.count_lines_skipped_http_errors.value,
            self.count_lines_skipped_http_redirects.value,
            self.count_lines_static.value,
            self.count_lines_skipped_downloads.value,
            self.count_lines_no_site.value,
            self.count_lines_hostname_skipped.value,
        ]),
    'count_lines_invalid': self.count_lines_invalid.value,
    'count_lines_filtered': self.count_lines_filtered.value,
    'count_lines_skipped_user_agent': self.count_lines_skipped_user_agent.value,
    'count_lines_skipped_http_errors': self.count_lines_skipped_http_errors.value,
    'count_lines_skipped_http_redirects': self.count_lines_skipped_http_redirects.value,
    'count_lines_static': self.count_lines_static.value,
    'count_lines_skipped_downloads': self.count_lines_skipped_downloads.value,
    'count_lines_no_site': self.count_lines_no_site.value,
    'count_lines_hostname_skipped': self.count_lines_hostname_skipped.value,
    'total_sites': len(self.matomo_sites),
    'total_sites_existing': len(self.matomo_sites - set(site_id for hostname, site_id in self.matomo_sites_created)),
    'total_sites_created': len(self.matomo_sites_created),
    'sites_created': self._indent_text(
            ['%s (ID: %d)' % (hostname, site_id) for hostname, site_id in self.matomo_sites_created],
            level=3,
        ),
    'total_sites_ignored': len(self.matomo_sites_ignored),
    'sites_ignored': self._indent_text(
            self.matomo_sites_ignored, level=3,
        ),
    'sites_ignored_tips': '''
        TIPs:
         - if one of these hosts is an alias host for one of the websites
           in Matomo, you can add this host as an "Alias URL" in Settings > Websites.
         - use --add-sites-new-hosts if you wish to automatically create
           one website for each of these hosts in Matomo rather than discarding
           these requests.
         - use --idsite-fallback to force all these log lines with a new hostname
           to be recorded in a specific idsite (for example for troubleshooting/visualizing the data)
         - use --idsite to force all lines in the specified log files
           to be all recorded in the specified idsite
         - or you can also manually create a new Website in Matomo with the URL set to this hostname
''' if self.matomo_sites_ignored else '',
    'total_time': self.time_stop - self.time_start,
    'speed_recording': self._round_value(self._compute_speed(
            self.count_lines_recorded.value,
            self.time_start, self.time_stop,
        )),
    'url': config.options.matomo_api_url,
    'invalid_lines': invalid_lines_summary
}))

    ##
    ## The monitor is a thread that prints a short summary each second.
    ##

    def _monitor(self):
        """Monitor-thread loop: print progress every show_progress_delay seconds
        until monitor_stop is set.

        NOTE(review): reads the module-level `stats` and `config` objects
        rather than self for the counters and the delay.
        """
        latest_total_recorded = 0
        while not self.monitor_stop:
            current_total = stats.count_lines_recorded.value
            time_elapsed = time.time() - self.time_start
            print(('%d lines parsed, %d lines recorded, %d records/sec (avg), %d records/sec (current)' % (
                stats.count_lines_parsed.value,
                current_total,
                current_total / time_elapsed if time_elapsed != 0 else 0,
                (current_total - latest_total_recorded) / config.options.show_progress_delay,
            )))
            latest_total_recorded = current_total
            time.sleep(config.options.show_progress_delay)

    def start_monitor(self):
        """Start the progress monitor as a daemon thread (dies with the process)."""
        t = threading.Thread(target=self._monitor)
        t.daemon = True
        t.start()

    def stop_monitor(self):
        """Ask the monitor thread to exit after its current sleep."""
        self.monitor_stop = True
1376
class TimeHelper:
    """Small helper for converting log timezone offsets."""

    @staticmethod
    def timedelta_from_timezone(timezone):
        """Convert a '+HHMM'/'-HHMM' offset string to a datetime.timedelta.

        E.g. '+0130' -> 1h30m, '-0200' -> -2h.
        """
        offset = int(timezone)
        sign = -1 if offset < 0 else 1
        # The offset encodes hours in the hundreds digits, minutes below.
        hours, minutes = divmod(abs(offset), 100)
        return datetime.timedelta(hours=sign * hours, minutes=sign * minutes)
1389
1390class UrlHelper:
1391
    @staticmethod
    def convert_array_args(args):
        """
        Converts PHP deep query param arrays (eg, w/ names like hsr_ev[abc][0][]=value) into a nested list/dict
        structure that will convert correctly to JSON.

        Keys without brackets are copied through unchanged. Dicts whose keys
        are contiguous integers are converted to lists afterwards.
        """

        final_args = collections.OrderedDict()
        for key, value in args.items():
            indices = key.split('[')
            if '[' in key:
                # contains list of all indices, eg for abc[def][ghi][] = 123, indices would be ['abc', 'def', 'ghi', '']
                indices = [i.rstrip(']') for i in indices]

                # navigate the multidimensional array final_args, creating lists/dicts when needed, using indices
                element = final_args
                for i in range(0, len(indices) - 1):
                    idx = indices[i]

                    # if there's no next key, then this element is a list, otherwise a dict
                    element_type = list if not indices[i + 1] else dict
                    # replace any existing value of the wrong container type
                    if idx not in element or not isinstance(element[idx], element_type):
                        element[idx] = element_type()

                    element = element[idx]

                # set the value in the final container we navigated to
                if not indices[-1]: # last index is '[]' -> append to the list
                    element.append(value)
                else: # last index has a key, eg, '[abc]' -> set the dict entry
                    element[indices[-1]] = value
            else:
                final_args[key] = value

        return UrlHelper._convert_dicts_to_arrays(final_args)
1427
1428    @staticmethod
1429    def _convert_dicts_to_arrays(d):
1430        # convert dicts that have contiguous integer keys to arrays
1431        for key, value in d.items():
1432            if not isinstance(value, dict):
1433                continue
1434
1435            if UrlHelper._has_contiguous_int_keys(value):
1436                d[key] = UrlHelper._convert_dict_to_array(value)
1437            else:
1438                d[key] = UrlHelper._convert_dicts_to_arrays(value)
1439
1440        return d
1441
1442    @staticmethod
1443    def _has_contiguous_int_keys(d):
1444        for i in range(0, len(d)):
1445            if str(i) not in d:
1446                return False
1447        return True
1448
1449    @staticmethod
1450    def _convert_dict_to_array(d):
1451        result = []
1452        for i in range(0, len(d)):
1453            result.append(d[str(i)])
1454        return result
1455
class MatomoHttpBase:
    class Error(Exception):
        """Raised when a request to Matomo ultimately fails; carries the HTTP status code when known."""

        def __init__(self, message, code=None):
            super().__init__(message)
            self.code = code
1463
1464
class MatomoHttpUrllib(MatomoHttpBase):
    """
    Make requests to Matomo.
    """

    class RedirectHandlerWithLogging(urllib.request.HTTPRedirectHandler):
        """
        Special implementation of HTTPRedirectHandler that logs redirects in debug mode
        to help users debug system issues.
        """

        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
            logging.debug("Request redirected (code: %s) to '%s'" % (code, newurl))

            return urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, hdrs, newurl)

    def _call(self, path, args, headers=None, url=None, data=None):
        """
        Make a request to the Matomo site. It is up to the caller to format
        arguments, to embed authentication, etc.

        Returns the response body decoded with the charset announced by the
        server (falling back to UTF-8).
        """
        if url is None:
            url = config.options.matomo_url
        headers = headers or {}

        if data is None:
            # If Content-Type isn't defined, PHP do not parse the request's body.
            headers['Content-type'] = 'application/x-www-form-urlencoded'
            data = urllib.parse.urlencode(args)
        elif not isinstance(data, str) and headers['Content-type'] == 'application/json':
            # JSON payload goes in the body; any query args move onto the URL.
            data = json.dumps(data)

            if args:
                path = path + '?' + urllib.parse.urlencode(args)

        if config.options.request_suffix:
            path = path + ('&' if '?' in path else '?') + config.options.request_suffix

        headers['User-Agent'] = 'Matomo/LogImport'

        try:
            timeout = config.options.request_timeout
        except:
            timeout = None # the config global object may not be created at this point

        request = urllib.request.Request(url + path, data.encode("utf-8"), headers)

        # Handle basic auth if auth_user set
        try:
            auth_user = config.options.auth_user
            auth_password = config.options.auth_password
        except:
            auth_user = None
            auth_password = None

        if auth_user is not None:
            # encodebytes() inserts newlines every 76 chars; strip them for the header value.
            base64string = base64.encodebytes('{}:{}'.format(auth_user, auth_password).encode()).decode().replace('\n', '')
            request.add_header("Authorization", "Basic %s" % base64string)

        # Use non-default SSL context if invalid certificates shall be
        # accepted.
        # NOTE(review): the version check below is always true under Python 3
        # (enforced at the top of this script), so only the config option matters.
        if config.options.accept_invalid_ssl_certificate and \
                sys.version_info >= (2, 7, 9):
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            https_handler_args = {'context': ssl_context}
        else:
            https_handler_args = {}
        opener = urllib.request.build_opener(
            self.RedirectHandlerWithLogging(),
            urllib.request.HTTPSHandler(**https_handler_args))
        response = opener.open(request, timeout = timeout)
        encoding = response.info().get_content_charset('utf-8')
        result = response.read()
        response.close()
        return result.decode(encoding)

    def _call_api(self, method, **kwargs):
        """
        Make a request to the Matomo API taking care of authentication, body
        formatting, etc.

        Special kwargs: _token_auth overrides the configured auth token,
        _url overrides the configured API URL. Returns the parsed JSON response.
        """
        args = {
            'module' : 'API',
            'format' : 'json',
            'method' : method,
            'filter_limit' : '-1',
        }
        # token_auth, by default, is taken from config.
        token_auth = kwargs.pop('_token_auth', None)
        if token_auth is None:
            token_auth = config.options.matomo_token_auth
        if token_auth:
            args['token_auth'] = token_auth

        url = kwargs.pop('_url', None)
        if url is None:
            url = config.options.matomo_api_url


        if kwargs:
            args.update(kwargs)

        # Convert lists into appropriate format.
        # See: https://developer.matomo.org/api-reference/reporting-api#passing-an-array-of-data-as-a-parameter
        # Warning: we have to pass the parameters in order: foo[0], foo[1], foo[2]
        # and not foo[1], foo[0], foo[2] (it will break Matomo otherwise.)
        final_args = []
        for key, value in args.items():
            if isinstance(value, (list, tuple)):
                for index, obj in enumerate(value):
                    final_args.append(('%s[%d]' % (key, index), obj))
            else:
                final_args.append((key, value))


#        logging.debug('%s' % final_args)
#        logging.debug('%s' % url)

        res = self._call('/', final_args, url=url)

        try:
            return json.loads(res)
        except ValueError:
            raise urllib.error.URLError('Matomo returned an invalid response: ' + res.decode("utf-8") )

    def _call_wrapper(self, func, expected_response, on_failure, *args, **kwargs):
        """
        Try to make requests to Matomo at most MATOMO_FAILURE_MAX_RETRY times.

        Connection-level errors are retried with a delay; when expected_response
        is given, a mismatching response also counts as a failure (on_failure,
        if provided, builds the error message from the response and request data).
        """
        errors = 0
        while True:
            try:
                response = func(*args, **kwargs)
                if expected_response is not None and response != expected_response:
                    if on_failure is not None:
                        error_message = on_failure(response, kwargs.get('data'))
                    else:
                        error_message = "didn't receive the expected response. Response was %s " % response

                    raise urllib.error.URLError(error_message)
                return response
            except (urllib.error.URLError, http.client.HTTPException, ValueError, socket.timeout) as e:
                logging.info('Error when connecting to Matomo: %s', e)

                code = None
                if isinstance(e, urllib.error.HTTPError):
                    # See Python issue 13211.
                    message = 'HTTP Error %s %s' % (e.code, e.msg)
                    code = e.code
                elif isinstance(e, urllib.error.URLError):
                    message = e.reason
                else:
                    message = str(e)

                # decorate message w/ HTTP response, if it can be retrieved
                if hasattr(e, 'read'):
                    message = message + ", response: " + e.read().decode()

                try:
                    delay_after_failure = config.options.delay_after_failure
                    max_attempts = config.options.max_attempts
                except NameError:
                    # The config global may not exist yet; fall back to defaults.
                    delay_after_failure = MATOMO_DEFAULT_DELAY_AFTER_FAILURE
                    max_attempts = MATOMO_DEFAULT_MAX_ATTEMPTS

                errors += 1
                if errors == max_attempts:
                    logging.info("Max number of attempts reached, server is unreachable!")

                    raise MatomoHttpBase.Error(message, code)
                else:
                    logging.info("Retrying request, attempt number %d" % (errors + 1))

                    time.sleep(delay_after_failure)

    def call(self, path, args, expected_content=None, headers=None, data=None, on_failure=None):
        # Retrying wrapper around _call() (used for tracker requests).
        return self._call_wrapper(self._call, expected_content, on_failure, path, args, headers,
                                    data=data)

    def call_api(self, method, **kwargs):
        # Retrying wrapper around _call_api() (used for Reporting API requests).
        return self._call_wrapper(self._call_api, None, None, method, **kwargs)
1648
1649##
1650## Resolvers.
1651##
1652## A resolver is a class that turns a hostname into a Matomo site ID.
1653##
1654
class StaticResolver:
    """
    Resolver that maps every hit to a single, fixed site ID taken from the
    configuration.
    """

    def __init__(self, site_id):
        self.site_id = site_id
        # Fetch the site's main URL from the Matomo API up front.
        site_info = matomo.call_api('SitesManager.getSiteFromId', idSite=self.site_id)
        if site_info.get('result') == 'error':
            fatal_error("cannot get the main URL of this site: %s" % site_info.get('message'))
        self._main_url = site_info['main_url']
        stats.matomo_sites.add(self.site_id)

    def resolve(self, hit):
        """Every hit resolves to the configured site and its main URL."""
        return (self.site_id, self._main_url)

    def check_format(self, format):
        # Any log format is acceptable: the site ID is fixed, so no hostname is needed.
        pass
1678
class DynamicResolver:
    """
    Use Matomo API to determine the site ID.
    """

    # Serializes site creation across recorder threads so each hostname is
    # only created once.
    _add_site_lock = threading.Lock()

    def __init__(self):
        # Maps hostname -> resolved site ID; in replay mode, the 'sites' key
        # holds all existing sites as returned by the API.
        self._cache = {}
        if config.options.replay_tracking:
            # get existing sites
            self._cache['sites'] = matomo.call_api('SitesManager.getAllSites')

    def _get_site_id_from_hit_host(self, hit):
        """Ask the Matomo API which site(s) are registered for the hit's host."""
        return matomo.call_api(
            'SitesManager.getSitesIdFromSiteUrl',
            url=hit.host,
        )

    def _add_site(self, hit):
        """
        Find or create a Matomo site for the hit's hostname.

        Returns the existing/new site ID, the configured fallback ID, 0 in
        dry-run mode, or None when the hit should be ignored.
        """
        main_url = 'http://' + hit.host
        DynamicResolver._add_site_lock.acquire()

        try:
            # After we obtain the lock, make sure the site hasn't already been created.
            res = self._get_site_id_from_hit_host(hit)
            if res:
                return res[0]['idsite']

            # The site doesn't exist.
            logging.debug('No Matomo site found for the hostname: %s', hit.host)
            if config.options.site_id_fallback is not None:
                logging.debug('Using default site for hostname: %s', hit.host)
                return config.options.site_id_fallback
            elif config.options.add_sites_new_hosts:
                if config.options.dry_run:
                    # Let's just return a fake ID.
                    return 0
                logging.debug('Creating a Matomo site for hostname %s', hit.host)
                result = matomo.call_api(
                    'SitesManager.addSite',
                    siteName=hit.host,
                    urls=[main_url],
                )
                if result.get('result') == 'error':
                    logging.error("Couldn't create a Matomo site for host %s: %s",
                        hit.host, result.get('message'),
                    )
                    return None
                else:
                    site_id = result['value']
                    stats.matomo_sites_created.append((hit.host, site_id))
                    return site_id
            else:
                # The site doesn't exist, we don't want to create new sites and
                # there's no default site ID. We thus have to ignore this hit.
                return None
        finally:
            DynamicResolver._add_site_lock.release()

    def _resolve(self, hit):
        """Resolve the hit's host to a site ID, creating the site if allowed."""
        res = self._get_site_id_from_hit_host(hit)
        if res:
            # The site already exists.
            site_id = res[0]['idsite']
        else:
            site_id = self._add_site(hit)
        if site_id is not None:
            stats.matomo_sites.add(site_id)
        return site_id

    def _resolve_when_replay_tracking(self, hit):
        """
        If parsed site ID found in the _cache['sites'] return site ID and main_url,
        otherwise return (None, None) tuple.
        """
        site_id = hit.args['idsite']
        if site_id in self._cache['sites']:
            stats.matomo_sites.add(site_id)
            return (site_id, self._cache['sites'][site_id]['main_url'])
        else:
            return (None, None)

    def _resolve_by_host(self, hit):
        """
        Returns the site ID and site URL for a hit based on the hostname.
        """
        try:
            site_id = self._cache[hit.host]
        except KeyError:
            logging.debug(
                'Site ID for hostname %s not in cache', hit.host
            )
            site_id = self._resolve(hit)
            logging.debug('Site ID for hostname %s: %s', hit.host, site_id)
            self._cache[hit.host] = site_id
        return (site_id, 'http://' + hit.host)

    def resolve(self, hit):
        """
        Return the site ID from the cache if found, otherwise call _resolve.
        If replay_tracking option is enabled, call _resolve_when_replay_tracking.
        """
        if config.options.replay_tracking:
            # We only consider requests with piwik.php which don't need host to be imported
            return self._resolve_when_replay_tracking(hit)
        else:
            # Workaround for empty Host bug issue #126
            if hit.host.strip() == '':
                hit.host = 'no-hostname-found-in-log'
            return self._resolve_by_host(hit)

    def check_format(self, format):
        """Abort unless the format can supply a hostname (not needed in replay mode)."""
        if config.options.replay_tracking:
            pass
        elif format.regex is not None and 'host' not in format.regex.groupindex and not config.options.log_hostname:
            fatal_error(
                "the selected log format doesn't include the hostname: you must "
                "specify the Matomo site ID with the --idsite argument"
            )
1799
class Recorder:
    """
    A Recorder fetches hits from the Queue and inserts them into Matomo using
    the API.
    """

    # All launched recorder instances (shared across the class).
    recorders = []

    def __init__(self):
        # maxsize=2 keeps the parser only slightly ahead of this recorder.
        self.queue = queue.Queue(maxsize=2)

        # if bulk tracking disabled, make sure we can store hits outside of the Queue
        if not config.options.use_bulk_tracking:
            self.unrecorded_hits = []

    @classmethod
    def launch(cls, recorder_count):
        """
        Launch a bunch of Recorder objects in a separate thread.
        """
        for i in range(recorder_count):
            recorder = Recorder()
            cls.recorders.append(recorder)

            run = recorder._run_bulk if config.options.use_bulk_tracking else recorder._run_single
            t = threading.Thread(target=run)

            # Daemon threads, so a fatal error in the main thread ends the process.
            t.daemon = True
            t.start()
            logging.debug('Launched recorder')

    @classmethod
    def add_hits(cls, all_hits):
        """
        Add a set of hits to the recorders queue.
        """
        # Organize hits so that one client IP will always use the same queue.
        # We have to do this so visits from the same IP will be added in the right order.
        hits_by_client = [[] for r in cls.recorders]
        for hit in all_hits:
            hits_by_client[hit.get_visitor_id_hash() % len(cls.recorders)].append(hit)

        for i, recorder in enumerate(cls.recorders):
            recorder.queue.put(hits_by_client[i])

    @classmethod
    def wait_empty(cls):
        """
        Wait until all recorders have an empty queue.
        """
        for recorder in cls.recorders:
            recorder._wait_empty()

    def _run_bulk(self):
        # Thread body for bulk tracking: take a whole batch off the queue and
        # send it to Matomo in one BulkTracking request.
        while True:
            try:
                hits = self.queue.get()
            except:
                # TODO: we should log something here, however when this happens, logging.etc will throw
                return

            if len(hits) > 0:
                try:
                    self._record_hits(hits)
                except MatomoHttpBase.Error as e:
                    fatal_error(e, hits[0].filename, hits[0].lineno) # approximate location of error
            self.queue.task_done()

    def _run_single(self):
        # Thread body for one-request-per-hit tracking; drains a local buffer
        # refilled from the queue one batch at a time.
        while True:
            if config.options.force_one_action_interval != False:
                # Throttle to one tracked action per configured interval.
                time.sleep(config.options.force_one_action_interval)

            if len(self.unrecorded_hits) > 0:
                hit = self.unrecorded_hits.pop(0)

                try:
                    self._record_hits([hit])
                except MatomoHttpBase.Error as e:
                    fatal_error(e, hit.filename, hit.lineno)
            else:
                self.unrecorded_hits = self.queue.get()
                self.queue.task_done()

    def _wait_empty(self):
        """
        Wait until the queue is empty.
        """
        while True:
            if self.queue.empty():
                # We still have to wait for the last queue item being processed
                # (queue.empty() returns True before queue.task_done() is
                # called).
                self.queue.join()
                return
            time.sleep(1)

    def date_to_matomo(self, date):
        # Format a datetime as 'YYYY-MM-DD HH:MM:SS' for the tracking API.
        # The local names deliberately shadow the `date` parameter and the
        # `time` module within this method only.
        # NOTE(review): isoformat() already uses ':' in the time part, so the
        # '-' -> ':' replacement looks like a no-op here — presumably legacy.
        date, time = date.isoformat(sep=' ').split()
        return '%s %s' % (date, time.replace('-', ':'))

    def _get_hit_args(self, hit):
        """
        Returns the args used in tracking a hit, without the token_auth.
        """
        site_id, main_url = resolver.resolve(hit)
        if site_id is None:
            # This hit doesn't match any known Matomo site.
            if config.options.replay_tracking:
                stats.matomo_sites_ignored.add('unrecognized site ID %s' % hit.args.get('idsite'))
            else:
                stats.matomo_sites_ignored.add(hit.host)
            stats.count_lines_no_site.increment()
            return

        stats.dates_recorded.add(hit.date.date())

        path = hit.path
        if hit.query_string and not config.options.strip_query_string:
            path += config.options.query_string_delimiter + hit.query_string

        # only prepend main url / host if it's a path
        url_prefix = self._get_host_with_protocol(hit.host, main_url) if hasattr(hit, 'host') else main_url
        url = (url_prefix if path.startswith('/') else '') + path[:1024]

        # handle custom variables before generating args dict
        if config.options.enable_bots:
            if hit.is_robot:
                hit.add_visit_custom_var("Bot", hit.user_agent)
            else:
                hit.add_visit_custom_var("Not-Bot", hit.user_agent)

        hit.add_page_custom_var("HTTP-code", hit.status)

        args = {
            'rec': '1',
            'apiv': '1',
            'url': url,
            'urlref': hit.referrer[:1024],
            'cip': hit.ip,
            'cdt': self.date_to_matomo(hit.date),
            'idsite': site_id,
            'queuedtracking': '0',
            'dp': '0' if config.options.reverse_dns else '1',
            'ua': hit.user_agent
        }

        if config.options.replay_tracking:
            # prevent request to be force recorded when option replay-tracking
            args['rec'] = '0'

        # idsite is already determined by resolver
        if 'idsite' in hit.args:
            del hit.args['idsite']

        args.update(hit.args)

        if hit.is_download:
            args['download'] = args['url']

        if config.options.enable_bots:
            args['bots'] = '1'

        if hit.is_error or hit.is_redirect:
            # Synthesize a page title describing the error/redirect.
            args['action_name'] = '%s%sURL = %s%s' % (
                hit.status,
                config.options.title_category_delimiter,
                urllib.parse.quote(args['url'], ''),
                ("%sFrom = %s" % (
                    config.options.title_category_delimiter,
                    urllib.parse.quote(args['urlref'], '')
                ) if args['urlref'] != ''  else '')
            )

        if hit.generation_time_milli > 0:
            args['pf_srv'] = int(hit.generation_time_milli)

        if hit.event_category and hit.event_action:
            args['e_c'] = hit.event_category
            args['e_a'] = hit.event_action

            if hit.event_name:
                args['e_n'] = hit.event_name

        if hit.length:
            args['bw_bytes'] = hit.length

        # convert custom variable args to JSON
        if 'cvar' in args and not isinstance(args['cvar'], str):
            args['cvar'] = json.dumps(args['cvar'])

        if '_cvar' in args and not isinstance(args['_cvar'], str):
            args['_cvar'] = json.dumps(args['_cvar'])

        return UrlHelper.convert_array_args(args)

    def _get_host_with_protocol(self, host, main_url):
        # Borrow the scheme from main_url when the host doesn't carry one.
        if '://' not in host:
            parts = urllib.parse.urlparse(main_url)
            host = parts.scheme + '://' + host
        return host

    def _record_hits(self, hits):
        """
        Inserts several hits into Matomo.
        """
        if not config.options.dry_run:
            data = {
                'token_auth': config.options.matomo_token_auth,
                'requests': [self._get_hit_args(hit) for hit in hits]
            }
            try:
                args = {}

                if config.options.debug_tracker:
                    args['debug'] = '1'

                response = matomo.call(
                    config.options.matomo_tracker_endpoint_path, args=args,
                    expected_content=None,
                    headers={'Content-type': 'application/json'},
                    data=data,
                    on_failure=self._on_tracking_failure
                )

                if config.options.debug_tracker:
                    logging.debug('tracker response:\n%s' % response)

                # check for invalid requests
                try:
                    response = json.loads(response)
                except:
                    logging.info("bulk tracking returned invalid JSON")

                    # don't display the tracker response if we're debugging the tracker.
                    # debug tracker output will always break the normal JSON output.
                    if not config.options.debug_tracker:
                        logging.info("tracker response:\n%s" % response)

                    response = {}

                if ('invalid_indices' in response and isinstance(response['invalid_indices'], list) and
                    response['invalid_indices']):
                    invalid_count = len(response['invalid_indices'])

                    invalid_lines = [str(hits[index].lineno) for index in response['invalid_indices']]
                    invalid_lines_str = ", ".join(invalid_lines)

                    stats.invalid_lines.extend(invalid_lines)

                    logging.info("The Matomo tracker identified %s invalid requests on lines: %s" % (invalid_count, invalid_lines_str))
                elif 'invalid' in response and response['invalid'] > 0:
                    logging.info("The Matomo tracker identified %s invalid requests." % response['invalid'])
            except MatomoHttpBase.Error as e:
                # if the server returned 400 code, BulkTracking may not be enabled
                if e.code == 400:
                    fatal_error("Server returned status 400 (Bad Request).\nIs the BulkTracking plugin disabled?", hits[0].filename, hits[0].lineno)

                raise

        stats.count_lines_recorded.advance(len(hits))

    def _is_json(self, result):
        """Return True if result parses as JSON, False otherwise."""
        try:
            json.loads(result)
            return True
        except ValueError:
            return False

    def _on_tracking_failure(self, response, data):
        """
        Removes the successfully tracked hits from the request payload so
        they are not logged twice.
        """
        try:
            response = json.loads(response)
        except:
            # the response should be in JSON, but in case it can't be parsed just try another attempt
            logging.debug("cannot parse tracker response, should be valid JSON")
            return response

        # remove the successfully tracked hits from payload
        tracked = response['tracked']
        data['requests'] = data['requests'][tracked:]

        return response['message']
2086
class Hit:
    """
    Plain data holder for a single parsed log line: every keyword argument
    becomes an attribute.
    """
    def __init__(self, **kwargs):
        for attribute_name, attribute_value in kwargs.items():
            setattr(self, attribute_name, attribute_value)
        super(Hit, self).__init__()

        # Optionally normalize the path casing, as configured.
        if config.options.force_lowercase_path:
            self.full_path = self.full_path.lower()

    def get_visitor_id_hash(self):
        """
        Return a non-negative hash of the visitor identifier, used to shard
        hits consistently across recorder queues.
        """
        visitor_id = self.ip

        if config.options.replay_tracking:
            # In replay mode, prefer an explicit visitor identifier from the
            # original tracking request when one is present.
            for candidate in ('uid', 'cid', '_id', 'cip'):
                if candidate in self.args:
                    visitor_id = self.args[candidate]
                    break

        return abs(hash(visitor_id))

    def add_page_custom_var(self, key, value):
        """
        Adds a page custom variable to this Hit.
        """
        self._add_custom_var(key, value, 'cvar')

    def add_visit_custom_var(self, key, value):
        """
        Adds a visit custom variable to this Hit.
        """
        self._add_custom_var(key, value, '_cvar')

    def _add_custom_var(self, key, value, api_arg_name):
        """Store [key, value] under the next 1-based index of args[api_arg_name]."""
        container = self.args.setdefault(api_arg_name, {})

        if isinstance(container, str):
            logging.debug("Ignoring custom %s variable addition [ %s = %s ], custom var already set to string." % (api_arg_name, key, value))
            return

        container[len(container) + 1] = [key, value]
2132
2133class Parser:
2134    """
2135    The Parser parses the lines in a specified file and inserts them into
2136    a Queue.
2137    """
2138
2139    def __init__(self):
2140        self.check_methods = [method for name, method
2141                              in inspect.getmembers(self, predicate=inspect.ismethod)
2142                              if name.startswith('check_')]
2143
2144    ## All check_* methods are called for each hit and must return True if the
2145    ## hit can be imported, False otherwise.
2146
2147    def check_hostname(self, hit):
2148        # Check against config.hostnames.
2149        if not hasattr(hit, 'host') or not config.options.hostnames:
2150            return True
2151
2152        # Accept the hostname only if it matches one pattern in the list.
2153        result = any(
2154            fnmatch.fnmatch(hit.host, pattern)
2155            for pattern in config.options.hostnames
2156        )
2157        if not result:
2158            stats.count_lines_hostname_skipped.increment()
2159        return result
2160
2161    def check_static(self, hit):
2162        filename = hit.path.split('/')[-1]
2163
2164        if hit.extension in STATIC_EXTENSIONS or filename in STATIC_FILES:
2165            if config.options.enable_static:
2166                hit.is_download = True
2167                return True
2168            else:
2169                stats.count_lines_static.increment()
2170                return False
2171        return True
2172
2173    def check_download(self, hit):
2174        if hit.extension in config.options.download_extensions:
2175            stats.count_lines_downloads.increment()
2176            hit.is_download = True
2177            return True
2178        # the file is not in the white-listed downloads
2179        # if it's a know download file, we shall skip it
2180        elif hit.extension in DOWNLOAD_EXTENSIONS:
2181            stats.count_lines_skipped_downloads.increment()
2182            return False
2183        return True
2184
2185    def check_user_agent(self, hit):
2186        user_agent = hit.user_agent.lower()
2187        for s in itertools.chain(EXCLUDED_USER_AGENTS, config.options.excluded_useragents):
2188            if s in user_agent:
2189                if config.options.enable_bots:
2190                    hit.is_robot = True
2191                    return True
2192                else:
2193                    stats.count_lines_skipped_user_agent.increment()
2194                    return False
2195        return True
2196
2197    def check_http_error(self, hit):
2198        if hit.status[0] in ('4', '5'):
2199            if config.options.replay_tracking:
2200                # process error logs for replay tracking, since we don't care if matomo error-ed the first time
2201                return True
2202            elif config.options.enable_http_errors:
2203                hit.is_error = True
2204                return True
2205            else:
2206                stats.count_lines_skipped_http_errors.increment()
2207                return False
2208        return True
2209
2210    def check_http_redirect(self, hit):
2211        if hit.status[0] == '3' and hit.status != '304':
2212            if config.options.enable_http_redirects:
2213                hit.is_redirect = True
2214                return True
2215            else:
2216                stats.count_lines_skipped_http_redirects.increment()
2217                return False
2218        return True
2219
2220    def check_path(self, hit):
2221        for excluded_path in config.options.excluded_paths:
2222            if fnmatch.fnmatch(hit.path, excluded_path):
2223                return False
2224        # By default, all paths are included.
2225        if config.options.included_paths:
2226           for included_path in config.options.included_paths:
2227               if fnmatch.fnmatch(hit.path, included_path):
2228                   return True
2229           return False
2230        return True
2231
2232    @staticmethod
2233    def check_format(lineOrFile):
2234        format = False
2235        format_groups = 0
2236        for name, candidate_format in FORMATS.items():
2237            logging.debug("Check format %s", name)
2238
2239            # skip auto detection for formats that can't be detected automatically
2240            if name == 'ovh':
2241                continue
2242
2243            match = None
2244            try:
2245                if isinstance(lineOrFile, str):
2246                    match = candidate_format.check_format_line(lineOrFile)
2247                else:
2248                    match = candidate_format.check_format(lineOrFile)
2249            except Exception:
2250                logging.debug('Error in format checking: %s', traceback.format_exc())
2251                pass
2252
2253            if match:
2254                logging.debug('Format %s matches', name)
2255
2256                # compare format groups if this *BaseFormat has groups() method
2257                try:
2258                    # if there's more info in this match, use this format
2259                    match_groups = len(match.groups())
2260
2261                    logging.debug('Format match contains %d groups' % match_groups)
2262
2263                    if format_groups < match_groups:
2264                        format = candidate_format
2265                        format_groups = match_groups
2266                except AttributeError:
2267                    format = candidate_format
2268
2269            else:
2270                logging.debug('Format %s does not match', name)
2271
2272        # if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the
2273        # --w3c-time-taken-milli option isn't set
2274        if isinstance(format, W3cExtendedFormat):
2275            format.check_for_iis_option()
2276
2277        return format
2278
2279    @staticmethod
2280    def detect_format(file):
2281        """
2282        Return the best matching format for this file, or None if none was found.
2283        """
2284        logging.debug('Detecting the log format')
2285
2286        format = False
2287
2288        # check the format using the file (for formats like the W3cExtendedFormat one)
2289        format = Parser.check_format(file)
2290
2291        # check the format using the first N lines (to avoid irregular ones)
2292        lineno = 0
2293        limit = 100000
2294        while not format and lineno < limit:
2295            line = file.readline()
2296            if not line: # if at eof, don't keep looping
2297                break
2298
2299            lineno = lineno + 1
2300
2301            logging.debug("Detecting format against line %i" % lineno)
2302            format = Parser.check_format(line)
2303
2304        try:
2305            file.seek(0)
2306        except IOError:
2307            pass
2308
2309        if not format:
2310            fatal_error("cannot automatically determine the log format using the first %d lines of the log file. " % limit +
2311                        "\nMaybe try specifying the format with the --log-format-name command line argument." )
2312            return
2313
2314        logging.debug('Format %s is the best match', format.name)
2315        return format
2316
2317    def is_filtered(self, hit):
2318        host = None
2319        if hasattr(hit, 'host'):
2320            host = hit.host
2321        else:
2322            try:
2323                host = urllib.parse.urlparse(hit.path).hostname
2324            except:
2325                pass
2326
2327        if host:
2328            if config.options.exclude_host and len(config.options.exclude_host) > 0 and host in config.options.exclude_host:
2329                return (True, 'host matched --exclude-host')
2330
2331            if config.options.include_host and len(config.options.include_host) > 0 and host not in config.options.include_host:
2332                return (True, 'host did not match --include-host')
2333
2334        if config.options.exclude_older_than and hit.date < config.options.exclude_older_than:
2335            return (True, 'date is older than --exclude-older-than')
2336
2337        if config.options.exclude_newer_than and hit.date > config.options.exclude_newer_than:
2338            return (True, 'date is newer than --exclude-newer-than')
2339
2340        return (False, None)
2341
    def parse(self, filename):
        """
        Parse the specified filename and insert hits in the queue.

        Reads the log line by line, matches each line against the configured
        or auto-detected format, builds Hit objects, runs them through the
        check_* filters, and hands batches of hits to the Recorder.
        """
        def invalid_line(line, reason):
            # Count a line that could not be parsed; details only at debug>=2.
            stats.count_lines_invalid.increment()
            if config.options.debug >= 2:
                logging.debug('Invalid line detected (%s): %s' % (reason, line))

        def filtered_line(line, reason):
            # Count a line that parsed fine but was excluded by a filter.
            stats.count_lines_filtered.increment()
            if config.options.debug >= 2:
                logging.debug('Filtered line out (%s): %s' % (reason, line))

        if filename == '-':
            filename = '(stdin)'
            file = sys.stdin
        else:
            if not os.path.exists(filename):
                print("\n=====> Warning: File %s does not exist <=====" % filename, file=sys.stderr)
                return
            else:
                # Transparently support compressed logs based on extension.
                if filename.endswith('.bz2'):
                    open_func = bz2.open
                elif filename.endswith('.gz'):
                    open_func = gzip.open
                else:
                    open_func = open

                # surrogateescape round-trips bytes invalid in the configured
                # encoding instead of raising.
                file = open_func(filename, mode='rt', encoding=config.options.encoding, errors="surrogateescape")

        if config.options.show_progress:
            print(('Parsing log %s...' % filename))

        if config.format:
            # The format was explicitly specified.
            format = config.format

            if isinstance(format, W3cExtendedFormat):
                # W3C logs need their '#Fields:' header line to build a regex.
                format.create_regex(file)

                if format.regex is None:
                    return fatal_error(
                        "File is not in the correct format, is there a '#Fields:' line? "
                        "If not, use the --w3c-fields option."
                    )
        else:
            # If the file is empty, don't bother.
            data = file.read(100)
            if len(data.strip()) == 0:
                return
            try:
                file.seek(0)
            except IOError:
                # stdin and other unseekable streams; detection still works.
                pass

            format = self.detect_format(file)
            if format is None:
                return fatal_error(
                    'Cannot guess the logs format. Please give one using '
                    'either the --log-format-name or --log-format-regex option'
                )
        # Make sure the format is compatible with the resolver.
        resolver.check_format(format)

        if config.options.dump_log_regex:
            # Diagnostic mode: print the regex being used and stop.
            logging.info("Using format '%s'." % format.name)
            if format.regex:
                logging.info("Regex being used: %s" % format.regex.pattern)
            else:
                logging.info("Format %s does not use a regex to parse log lines." % format.name)
            logging.info("--dump-log-regex option used, aborting log import.")
            os._exit(0)

        valid_lines_count = 0

        hits = []
        # lineno is 0-based for the first parsed line; also used for --skip hints.
        lineno = -1
        while True:
            line = file.readline()
            if not line: break
            lineno = lineno + 1

            stats.count_lines_parsed.increment()
            # --skip: fast-forward past already-imported lines.
            if stats.count_lines_parsed.value <= config.options.skip:
                continue

            match = format.match(line)
            if not match:
                invalid_line(line, 'line did not match')
                continue

            valid_lines_count = valid_lines_count + 1
            # --debug-request-limit: stop early after N valid lines, flushing
            # any pending hits first.
            if config.options.debug_request_limit and valid_lines_count >= config.options.debug_request_limit:
                if len(hits) > 0:
                    Recorder.add_hits(hits)
                logging.info("Exceeded limit specified in --debug-request-limit, exiting.")
                return

            hit = Hit(
                filename=filename,
                lineno=lineno,
                status=format.get('status'),
                full_path=format.get('path'),
                is_download=False,
                is_robot=False,
                is_error=False,
                is_redirect=False,
                args={},
            )

            # Map captured regex groups onto page/visit custom variables.
            if config.options.regex_group_to_page_cvars_map:
                self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True)

            if config.options.regex_group_to_visit_cvars_map:
                self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False)

            if config.options.regex_groups_to_ignore:
                format.remove_ignored_groups(config.options.regex_groups_to_ignore)

            # Add http method page cvar
            try:
                httpmethod = format.get('method')
                if config.options.track_http_method and httpmethod != '-':
                    hit.add_page_custom_var('HTTP-method', httpmethod)
            except:
                # best-effort: not every format exposes the HTTP method
                pass

            try:
                hit.query_string = format.get('query_string')
                hit.path = hit.full_path
            except BaseFormatException:
                # Format has no separate query-string group: split the path.
                hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)

            # W3cExtendedFormat defaults to - when there is no query string, but we want empty string
            if hit.query_string == '-':
                hit.query_string = ''

            hit.extension = hit.path.rsplit('.')[-1].lower()

            try:
                hit.referrer = format.get('referrer')

                # strip enclosing quotes some formats leave around the value
                if hit.referrer.startswith('"'):
                    hit.referrer = hit.referrer[1:-1]
            except BaseFormatException:
                hit.referrer = ''
            if hit.referrer == '-':
                hit.referrer = ''

            try:
                hit.user_agent = format.get('user_agent')

                # in case a format parser included enclosing quotes, remove them so they are not
                # sent to Matomo
                if hit.user_agent.startswith('"'):
                    hit.user_agent = hit.user_agent[1:-1]
            except BaseFormatException:
                hit.user_agent = ''

            hit.ip = format.get('ip')
            try:
                hit.length = int(format.get('length'))
            except (ValueError, BaseFormatException):
                # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs)
                hit.length = 0

            # Generation time may be logged in milli, micro, or whole seconds;
            # try each in turn, normalizing to milliseconds.
            try:
                hit.generation_time_milli = float(format.get('generation_time_milli'))
            except (ValueError, BaseFormatException):
                try:
                    hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000
                except (ValueError, BaseFormatException):
                    try:
                        hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000
                    except (ValueError, BaseFormatException):
                        hit.generation_time_milli = 0

            if config.options.log_hostname:
                # --log-hostname overrides whatever the log line says.
                hit.host = config.options.log_hostname
            else:
                try:
                    hit.host = format.get('host').lower().strip('.')

                    # strip enclosing quotes some formats leave around the value
                    if hit.host.startswith('"'):
                        hit.host = hit.host[1:-1]
                except BaseFormatException:
                    # Some formats have no host.
                    pass

            # Add userid
            try:
                hit.userid = None

                userid = format.get('userid')
                if userid != '-':
                    hit.args['uid'] = hit.userid = userid
            except:
                # best-effort: not every format exposes a userid
                pass

            # add event info
            try:
                hit.event_category = hit.event_action = hit.event_name = None

                hit.event_category = format.get('event_category')
                hit.event_action = format.get('event_action')

                hit.event_name = format.get('event_name')
                if hit.event_name == '-':
                    hit.event_name = None
            except:
                # best-effort: not every format exposes event fields
                pass

            # Check if the hit must be excluded.
            if not all((method(hit) for method in self.check_methods)):
                continue

            # Parse date.
            # We parse it after calling check_methods as it's quite CPU hungry, and
            # we want to avoid that cost for excluded hits.
            date_string = format.get('date')
            try:
                hit.date = datetime.datetime.strptime(date_string, format.date_format)
                hit.date += datetime.timedelta(seconds = config.options.seconds_to_add_to_date)
            except ValueError as e:
                invalid_line(line, 'invalid date or invalid format: %s' % str(e))
                continue

            # Parse timezone and subtract its value from the date
            try:
                timezone = format.get('timezone').replace(':', '')
                if timezone:
                    hit.date -= TimeHelper.timedelta_from_timezone(timezone)
            except BaseFormatException:
                pass
            except ValueError:
                invalid_line(line, 'invalid timezone')
                continue

            if config.options.replay_tracking:
                # we need a query string and we only consider requests with piwik.php
                if not hit.query_string or not self.is_hit_for_tracker(hit):
                    invalid_line(line, 'no query string, or ' + hit.path.lower() + ' does not end with piwik.php/matomo.php')
                    continue

                query_arguments = urllib.parse.parse_qs(hit.query_string)
                if not "idsite" in query_arguments:
                    invalid_line(line, 'missing idsite')
                    continue

                # parse_qs returns lists; keep the last value for each key.
                hit.args.update((k, v.pop()) for k, v in query_arguments.items())

                # Shift the tracker's own timestamp parameters as well, so they
                # stay consistent with the shifted hit date.
                if config.options.seconds_to_add_to_date:
                    for param in ['_idts', '_viewts', '_ects', '_refts']:
                        if param in hit.args:
                            hit.args[param] = int(hit.args[param]) + config.options.seconds_to_add_to_date

            (is_filtered, reason) = self.is_filtered(hit)
            if is_filtered:
                filtered_line(line, reason)
                continue

            hits.append(hit)

            # Flush a full batch to the recorder threads.
            if len(hits) >= config.options.recorder_max_payload_size * len(Recorder.recorders):
                Recorder.add_hits(hits)
                hits = []

        # add last chunk of hits
        if len(hits) > 0:
            Recorder.add_hits(hits)
2613
2614    def is_hit_for_tracker(self, hit):
2615        filesToCheck = ['piwik.php', 'matomo.php']
2616        if config.options.replay_tracking_expected_tracker_file:
2617            filesToCheck = [config.options.replay_tracking_expected_tracker_file]
2618
2619        lowerPath = hit.path.lower()
2620        for file in filesToCheck:
2621            if lowerPath.endswith(file):
2622                return True
2623        return False
2624
2625    def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var):
2626        for group_name, custom_var_name in groups.items():
2627            if group_name in format.get_all():
2628                value = format.get(group_name)
2629
2630                # don't track the '-' empty placeholder value
2631                if value == '-':
2632                    continue
2633
2634                if is_page_var:
2635                    hit.add_page_custom_var(custom_var_name, value)
2636                else:
2637                    hit.add_visit_custom_var(custom_var_name, value)
2638
def main():
    """
    Start the importing process.

    Launches the recorder threads, parses every configured log file, waits
    for the hit queue to drain, then prints the summary. A KeyboardInterrupt
    stops parsing early but still produces the summary.
    """
    stats.set_time_start()

    if config.options.show_progress:
        stats.start_monitor()

    # Spin up the worker threads that submit hits to Matomo.
    # (The return value was previously bound to an unused local.)
    Recorder.launch(config.options.recorders)

    try:
        for filename in config.filenames:
            parser.parse(filename)

        # Block until every queued hit has been submitted.
        Recorder.wait_empty()
    except KeyboardInterrupt:
        pass

    stats.set_time_stop()

    if config.options.show_progress:
        stats.stop_monitor()

    stats.print_summary()
2664
def fatal_error(error, filename=None, lineno=None):
    """Print a fatal error (plus an optional --skip resume hint) and exit."""
    print('Fatal error: %s' % error, file=sys.stderr)
    if filename and lineno is not None:
        hint = (
            'You can restart the import of "%s" from the point it failed by '
            'specifying --skip=%d on the command line.\n' % (filename, lineno)
        )
        print(hint, file=sys.stderr)
    # os._exit terminates immediately, without running interpreter cleanup.
    os._exit(1)
2673
if __name__ == '__main__':
    try:
        config = Configuration()
        # The matomo object depends on the config object, so we have to create
        # it after creating the configuration.
        matomo = MatomoHttpUrllib()
        # The init_token_auth method may need the matomo option, so we must call
        # it after creating the matomo object.
        config.init_token_auth()
        # Module-level globals used throughout the script.
        stats = Statistics()
        resolver = config.get_resolver()
        parser = Parser()
        main()
        sys.exit(0)
    except KeyboardInterrupt:
        # Ctrl-C during setup: exit quietly without a traceback.
        pass
2690