1# -*- coding: utf-8 -*-
2# vim:et sts=4 sw=4
3#
4# ibus-typing-booster - A completion input method for IBus
5#
6# Copyright (c) 2011-2013 Anish Patil <apatil@redhat.com>
7# Copyright (c) 2012-2018 Mike FABIAN <mfabian@redhat.com>
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#  This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17#  You should have received a copy of the GNU General Public License
18# along with this program.  If not, see <http://www.gnu.org/licenses/>
19'''
20Module for ibus-typing-booster to access the sqlite3 databases
21'''
22
23from typing import Dict
24from typing import List
25from typing import Tuple
26from typing import Optional
27from typing import Any
28from typing import Callable
29import os
30import os.path as path
31import codecs
32import unicodedata
33import sqlite3
34import time
35import re
36import gzip
37import logging
38import itb_util
39import hunspell_suggest
40
41LOGGER = logging.getLogger('ibus-typing-booster')
42
43DEBUG_LEVEL = int(0)
44
45USER_DATABASE_VERSION = '0.65'
46
47class TabSqliteDb:
48    # pylint: disable=line-too-long
49    '''Phrase databases for ibus-typing-booster
50
51    The phrases table in the database has columns with the names:
52
53    “id”, “input_phrase”, “phrase”, “p_phrase”, “pp_phrase”, “user_freq”, “timestamp”
54
55    There are 2 databases, sysdb, userdb.
56
57    sysdb: “Database” with the suggestions from the hunspell dictionaries
58        user_freq = 0 always.
59
60        Actually there is no Sqlite3 database called “sysdb”, these
61        are the suggestions coming from hunspell_suggest, i.e. from
62        grepping the hunspell dictionaries and from pyhunspell.
63        (Historic note: ibus-typing-booster started as a fork of
64        ibus-table, in ibus-table “sysdb” is a Sqlite3 database
65        which is installed systemwide and readonly for the user)
66
67    user_db: Database on disk where the phrases learned from the user are stored
68        user_freq >= 1: The number of times the user has used this phrase
69    '''
70    # pylint: enable=line-too-long
71    def __init__(self, user_db_file='') -> None:
72        global DEBUG_LEVEL
73        try:
74            DEBUG_LEVEL = int(str(os.getenv('IBUS_TYPING_BOOSTER_DEBUG_LEVEL')))
75        except (TypeError, ValueError):
76            DEBUG_LEVEL = int(0)
77        if DEBUG_LEVEL > 1:
78            LOGGER.debug(
79                'TabSqliteDb.__init__(user_db_file = %s)', user_db_file)
80        self.user_db_file = user_db_file
81        if not self.user_db_file and os.getenv('HOME'):
82            self.user_db_file = path.join(
83                str(os.getenv('HOME')), '.local/share/ibus-typing-booster/user.db')
84        if not self.user_db_file:
85            LOGGER.debug('Falling back to ":memory:" for user.db')
86            self.user_db_file = ':memory:'
87        if (self.user_db_file != ':memory:'
88                and not os.path.isdir(os.path.dirname(self.user_db_file))):
89            os.makedirs(os.path.dirname(self.user_db_file), exist_ok=True)
90        self._phrase_table_column_names = [
91            'id',
92            'input_phrase',
93            'phrase',
94            'p_phrase',
95            'pp_phrase',
96            'user_freq',
97            'timestamp']
98
99        self.old_phrases = []
100
101        self.hunspell_obj = hunspell_suggest.Hunspell(())
102
103        if self.user_db_file != ':memory:':
104            if not os.path.exists(self.user_db_file):
105                LOGGER.info(
106                    'The user database %(udb)s does not exist yet.',
107                    {'udb': self.user_db_file})
108            else:
109                try:
110                    desc = self.get_database_desc(self.user_db_file)
111                    if (desc is None
112                            or desc["version"] != USER_DATABASE_VERSION
113                            or (self.get_number_of_columns_of_phrase_table(
114                                self.user_db_file)
115                                != len(self._phrase_table_column_names))):
116                        LOGGER.info(
117                            'The user database %(udb)s seems incompatible',
118                            {'udb': self.user_db_file})
119                        if desc is None:
120                            LOGGER.info(
121                                'No version information in the database')
122                        elif desc["version"] != USER_DATABASE_VERSION:
123                            LOGGER.info(
124                                'The version of the database does not match '
125                                '(too old or too new?)')
126                            LOGGER.info(
127                                'ibus-typing-booster wants version=%s',
128                                USER_DATABASE_VERSION)
129                            LOGGER.info(
130                                'But the  database actually has version=%s',
131                                desc["version"])
132                        elif (self.get_number_of_columns_of_phrase_table(
133                                self.user_db_file)
134                              != len(self._phrase_table_column_names)):
135                            LOGGER.info(
136                                'The number of columns of the database '
137                                'does not match')
138                            LOGGER.info(
139                                'ibus-typing-booster expects %(col)s columns',
140                                {'col': len(self._phrase_table_column_names)})
141                            LOGGER.info(
142                                'The database actually has %(col)s columns',
143                                {'col':
144                                 self.get_number_of_columns_of_phrase_table(
145                                     self.user_db_file)})
146                        LOGGER.info(
147                            'Trying to recover the phrases from the old, '
148                            'incompatible database')
149                        self.old_phrases = self.extract_user_phrases()
150                        timestamp = time.strftime('-%Y-%m-%d_%H:%M:%S')
151                        LOGGER.info(
152                            'Renaming the incompatible database to "%(name)s"',
153                            {'name': self.user_db_file+timestamp})
154                        if os.path.exists(self.user_db_file):
155                            os.rename(self.user_db_file,
156                                      self.user_db_file+timestamp)
157                        if os.path.exists(self.user_db_file+'-shm'):
158                            os.rename(self.user_db_file+'-shm',
159                                      self.user_db_file+'-shm'+timestamp)
160                        if os.path.exists(self.user_db_file+'-wal'):
161                            os.rename(self.user_db_file+'-wal',
162                                      self.user_db_file+'-wal'+timestamp)
163                        LOGGER.info(
164                            'Creating a new, empty database "%(name)s".',
165                            {'name': self.user_db_file})
166                        self.init_user_db()
167                        LOGGER.info(
168                            'If user phrases were successfully recovered '
169                            'from the old, '
170                            'incompatible database, they will be used to '
171                            'initialize the new database.')
172                    else:
173                        LOGGER.info(
174                            'Compatible database %(db)s found.',
175                            {'db': self.user_db_file})
176                except Exception:
177                    LOGGER.exception(
178                        'Unexpected error trying to find user database.')
179
180        # open user phrase database
181        try:
182            LOGGER.info(
183                'Connect to the database %(name)s.',
184                {'name': self.user_db_file})
185            self.database = sqlite3.connect(self.user_db_file)
186            self.set_database_pragma_options()
187            self.database.executescript(
188                'ATTACH DATABASE "%s" AS user_db;' % self.user_db_file)
189        except Exception:
190            LOGGER.exception(
191                'Could not open the database %(name)s.',
192                {'name': self.user_db_file})
193            timestamp = time.strftime('-%Y-%m-%d_%H:%M:%S')
194            LOGGER.info(
195                'Renaming the incompatible database to "%(name)s".',
196                {'name': self.user_db_file+timestamp})
197            if os.path.exists(self.user_db_file):
198                os.rename(self.user_db_file, self.user_db_file+timestamp)
199            if os.path.exists(self.user_db_file+'-shm'):
200                os.rename(self.user_db_file+'-shm',
201                          self.user_db_file+'-shm'+timestamp)
202            if os.path.exists(self.user_db_file+'-wal'):
203                os.rename(self.user_db_file+'-wal',
204                          self.user_db_file+'-wal'+timestamp)
205            LOGGER.info(
206                'Creating a new, empty database "%(name)s".',
207                {'name': self.user_db_file})
208            self.init_user_db()
209            self.database = sqlite3.connect(self.user_db_file)
210            self.set_database_pragma_options()
211            self.database.executescript(
212                'ATTACH DATABASE "%s" AS user_db;' % self.user_db_file)
213        self.create_tables()
214        if self.old_phrases:
215            sqlargs = []
216            for ophrase in self.old_phrases:
217                sqlargs.append(
218                    {'input_phrase': ophrase[0],
219                     'phrase': ophrase[0],
220                     'p_phrase': '',
221                     'pp_phrase': '',
222                     'user_freq': ophrase[1],
223                     'timestamp': time.time()})
224            sqlstr = '''
225            INSERT INTO user_db.phrases (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
226            VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
227            ;'''
228            try:
229                self.database.executemany(sqlstr, sqlargs)
230            except Exception:
231                LOGGER.exception(
232                    'Unexpected error inserting old phrases '
233                    'into the user database.')
234            self.database.commit()
235            self.database.execute('PRAGMA wal_checkpoint;')
236
237        # try create all hunspell-tables in user database
238        self.create_indexes(commit=False)
239        self.generate_userdb_desc()
240
241    def update_phrase(
242            self,
243            input_phrase: str = '',
244            phrase: str = '',
245            p_phrase: str = '',
246            pp_phrase: str = '',
247            user_freq: int = 0,
248            commit: bool = True) -> None:
249        '''
250        update the user frequency of a phrase
251        '''
252        if not input_phrase or not phrase:
253            return
254        input_phrase = itb_util.remove_accents(input_phrase.lower())
255        phrase = unicodedata.normalize(
256            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
257        p_phrase = itb_util.remove_accents(p_phrase.lower())
258        pp_phrase = itb_util.remove_accents(pp_phrase.lower())
259        sqlstr = '''
260        UPDATE user_db.phrases
261        SET user_freq = :user_freq, timestamp = :timestamp
262        WHERE input_phrase = :input_phrase
263         AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
264        ;'''
265        sqlargs = {'user_freq': user_freq,
266                   'input_phrase': input_phrase,
267                   'phrase': phrase,
268                   'p_phrase': p_phrase,
269                   'pp_phrase': pp_phrase,
270                   'timestamp': time.time()}
271        if DEBUG_LEVEL > 1:
272            LOGGER.debug('sqlstr=%s', sqlstr)
273            LOGGER.debug('sqlargs=%s', sqlargs)
274        try:
275            self.database.execute(sqlstr, sqlargs)
276            if commit:
277                self.database.commit()
278        except Exception:
279            LOGGER.exception('Unexpected error updating phrase in user_db.')
280
281    def sync_usrdb(self) -> None:
282        '''
283        Trigger a checkpoint operation.
284        '''
285        if DEBUG_LEVEL > 1:
286            LOGGER.debug('commit and execute checkpoint ...')
287        self.database.commit()
288        self.database.execute('PRAGMA wal_checkpoint;')
289        if DEBUG_LEVEL > 1:
290            LOGGER.debug('commit and execute checkpoint done.')
291
292    def create_tables(self) -> None:
293        '''Create table for the phrases.'''
294        sqlstr = '''CREATE TABLE IF NOT EXISTS user_db.phrases
295                    (id INTEGER PRIMARY KEY,
296                    input_phrase TEXT, phrase TEXT, p_phrase TEXT, pp_phrase TEXT,
297                    user_freq INTEGER, timestamp REAL);'''
298        self.database.execute(sqlstr)
299        self.database.commit()
300
301    def add_phrase(
302            self,
303            input_phrase: str = '',
304            phrase: str = '',
305            p_phrase: str = '',
306            pp_phrase: str = '',
307            user_freq: int = 0,
308            commit: bool = True) -> None:
309        '''
310        Add phrase to database
311        '''
312        if DEBUG_LEVEL > 1:
313            LOGGER.debug(
314                'input_phrase=%s phrase=%s user_freq=%s ',
315                input_phrase.encode('UTF-8'),
316                phrase.encode('UTF-8'),
317                user_freq)
318        if not input_phrase or not phrase:
319            return
320        input_phrase = itb_util.remove_accents(input_phrase.lower())
321        phrase = unicodedata.normalize(
322            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
323        p_phrase = itb_util.remove_accents(p_phrase.lower())
324        pp_phrase = itb_util.remove_accents(pp_phrase.lower())
325        select_sqlstr = '''
326        SELECT * FROM user_db.phrases
327        WHERE input_phrase = :input_phrase
328        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
329        ;'''
330        select_sqlargs = {
331            'input_phrase': input_phrase,
332            'phrase': phrase,
333            'p_phrase': p_phrase,
334            'pp_phrase': pp_phrase}
335        if self.database.execute(select_sqlstr, select_sqlargs).fetchall():
336            # there is already such a phrase, i.e. add_phrase was called
337            # in error, do nothing to avoid duplicate entries.
338            return
339
340        insert_sqlstr = '''
341        INSERT INTO user_db.phrases
342        (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
343        VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
344        ;'''
345        insert_sqlargs = {'input_phrase': input_phrase,
346                          'phrase': phrase,
347                          'p_phrase': p_phrase,
348                          'pp_phrase': pp_phrase,
349                          'user_freq': user_freq,
350                          'timestamp': time.time()}
351        if DEBUG_LEVEL > 1:
352            LOGGER.debug('insert_sqlstr=%s', insert_sqlstr)
353            LOGGER.debug('insert_sqlargs=%s', insert_sqlargs)
354        try:
355            self.database.execute(insert_sqlstr, insert_sqlargs)
356            if commit:
357                self.database.commit()
358        except Exception:
359            LOGGER.exception('Unexpected error adding phrase to database.')
360
361    def create_indexes(self, commit: bool = True) -> None:
362        '''Create indexes for the database.'''
363        sqlstr = '''
364        CREATE INDEX IF NOT EXISTS user_db.phrases_index_p ON phrases
365        (input_phrase, id ASC);
366        CREATE INDEX IF NOT EXISTS user_db.phrases_index_i ON phrases
367        (phrase)
368        ;'''
369        self.database.executescript(sqlstr)
370        if commit:
371            self.database.commit()
372
373    def best_candidates(
374            self,
375            phrase_frequencies: Dict[str, float],
376            title=False) -> List[Tuple[str, float]]:
377        '''Sorts the phrase_frequencies dictionary and returns the best
378        candidates.
379
380        Should *not* change the phrase_frequencies dictionary!
381        '''
382        candidates = sorted(phrase_frequencies.items(),
383                            key=lambda x: (
384                                -1*x[1],   # user_freq descending
385                                len(x[0]), # len(phrase) ascending
386                                x[0]       # phrase alphabetical
387                            ))[:20]
388        if not title:
389            return candidates
390        candidates_title = []
391        phrases_title = set()
392        for candidate in candidates:
393            phrase = candidate[0]
394            phrase_title = phrase[:1].title() + phrase[1:]
395            if phrase_title in phrases_title:
396                continue
397            candidates_title.append((phrase_title, candidate[1]))
398            phrases_title.add(phrase_title)
399        return candidates_title
400
401    def select_words(
402            self,
403            input_phrase: str,
404            p_phrase: str = '',
405            pp_phrase: str = '') -> List[Tuple[str, float]]:
406        '''
407        Get phrases from database completing input_phrase.
408
409        Returns a list of matches where each match is a tuple in the
410        form of (phrase, user_freq), i.e. returns something like
411        [(phrase, user_freq), ...]
412        '''
413        input_phrase = unicodedata.normalize(
414            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
415        p_phrase = itb_util.remove_accents(p_phrase.lower())
416        pp_phrase = itb_util.remove_accents(pp_phrase.lower())
417        title_case = input_phrase.istitle()
418        if DEBUG_LEVEL > 1:
419            LOGGER.debug(
420                'input_phrase=%s p_phrase=%s pp_phrase=%s',
421                input_phrase, p_phrase, pp_phrase)
422        phrase_frequencies: Dict[str, float] = {}
423        if not ' ' in input_phrase:
424            # Get suggestions from hunspell dictionaries. But only
425            # if input_phrase does not contain spaces. The hunspell
426            # dictionaries contain only single words, not sentences.
427            # Trying to complete an input_phrase which contains spaces
428            # will never work and spell checking suggestions by hunspell
429            # for input which contains spaces is almost always nonsense.
430            phrase_frequencies.update([
431                x for x in self.hunspell_obj.suggest(input_phrase)])
432        if DEBUG_LEVEL > 1:
433            LOGGER.debug(
434                'hunspell: best_candidates=%s',
435                self.best_candidates(phrase_frequencies, title=title_case))
436        # Remove the accents *after* getting the hunspell candidates.
437        # If the accents were removed before getting the hunspell candidates
438        # an input phrase like “Glühwürmchen” would not be added as a
439        # candidate because hunspell would get “Gluhwurmchen” then and would
440        # not validate that as a correct word. And, because “Glühwürmchen”
441        # is not in the German hunspell dictionary as a single word but
442        # created by suffix and prefix rules, the accent insensitive match
443        # in the German hunspell dictionary would not find it either.
444        input_phrase = itb_util.remove_accents(input_phrase.lower())
445        # Now phrase_frequencies might contain something like this:
446        #
447        # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0}
448
449        # To quote a string to be used as a parameter when assembling
450        # an sqlite statement with Python string operations, remove
451        # all NUL characters, replace " with "" and wrap the whole
452        # string in double quotes. Assembling sqlite statements using
453        # parameters containing user input with python string operations
454        # is not recommended because of the risk of SQL injection attacks
455        # if the quoting is not done the right way. So it is better to use
456        # the parameter substitution of the sqlite3 python interface.
457        # But unfortunately that does not work when creating views,
458        # (“OperationalError: parameters are not allowed in views”).
459        quoted_input_phrase = input_phrase.replace(
460            '\x00', '').replace('"', '""')
461        self.database.execute('DROP VIEW IF EXISTS like_input_phrase_view;')
462        sqlstr = '''
463        CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS
464        SELECT * FROM user_db.phrases
465        WHERE input_phrase LIKE "%(quoted_input_phrase)s%%"
466        ;''' % {'quoted_input_phrase': quoted_input_phrase}
467        try:
468            self.database.execute(sqlstr)
469        except Exception:
470            LOGGER.exception('Unexpected error in creating database view.')
471        sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase}
472        sqlstr = (
473            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
474            'GROUP BY phrase;')
475        try:
476            # Get “unigram” data from user_db.
477            #
478            # Example: Let’s assume the user typed “co” and user_db contains
479            #
480            #     1|colou|colour|green|nice|1
481            #     2|col|colour|yellow|ugly|2
482            #     3|co|colour|green|awesome|1
483            #     4|co|cold|||1
484            #     5|conspirac|conspiracy|||5
485            #     6|conspi|conspiracy|||1
486            #     7|c|conspiracy|||1
487            results_uni = self.database.execute(sqlstr, sqlargs).fetchall()
488            # Then the result returned by .fetchall() is:
489            #
490            # [('colour', 4), ('cold', 1), ('conspiracy', 6)]
491            #
492            # (“c|conspiracy|1” is not selected because it doesn’t
493            # match the user input “LIKE co%”! I.e. this is filtered
494            # out by the VIEW created above already)
495        except Exception:
496            LOGGER.exception(
497                'Unexpected error getting “unigram” data from user_db.')
498        if not results_uni:
499            # If no unigrams matched, bigrams and trigrams cannot
500            # match either. We can stop here and return what we got
501            # from hunspell.
502            return self.best_candidates(phrase_frequencies, title=title_case)
503        # Now normalize the unigram frequencies with the total count
504        # (which is 11 in the above example), which gives us the
505        # normalized result:
506        # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)]
507        sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;'
508        try:
509            count = self.database.execute(sqlstr, sqlargs).fetchall()[0][0]
510        except Exception:
511            LOGGER.exception(
512                'Unexpected error getting total unigram count from user_db')
513        # Updating the phrase_frequency dictionary with the normalized
514        # results gives: {'conspiracy': 6/11, 'code': 0,
515        # 'communicability': 0, 'cold': 1/11, 'colour': 4/11}
516        for result_uni in results_uni:
517            phrase_frequencies.update(
518                [(result_uni[0], result_uni[1]/float(count))])
519        if DEBUG_LEVEL > 1:
520            LOGGER.debug(
521                'Unigram best_candidates=%s',
522                self.best_candidates(phrase_frequencies, title=title_case))
523        if not p_phrase:
524            # If no context for bigram matching is available, return
525            # what we have so far:
526            return self.best_candidates(phrase_frequencies, title=title_case)
527        sqlstr = (
528            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
529            'WHERE p_phrase = :p_phrase GROUP BY phrase;')
530        try:
531            results_bi = self.database.execute(sqlstr, sqlargs).fetchall()
532        except Exception:
533            LOGGER.exception(
534                'Unexpected error getting “bigram” data from user_db')
535        if not results_bi:
536            # If no bigram could be matched, return what we have so far:
537            return self.best_candidates(phrase_frequencies, title=title_case)
538        # get the total count of p_phrase to normalize the bigram frequencies:
539        sqlstr = (
540            'SELECT sum(user_freq) FROM like_input_phrase_view '
541            'WHERE p_phrase = :p_phrase;')
542        try:
543            count_p_phrase = self.database.execute(
544                sqlstr, sqlargs).fetchall()[0][0]
545        except Exception:
546            LOGGER.exception(
547                'Unexpected error getting total bigram count from user_db')
548        # Update the phrase frequency dictionary by using a linear
549        # combination of the unigram and the bigram results, giving
550        # both the weight of 0.5:
551        for result_bi in results_bi:
552            phrase_frequencies.update(
553                [(result_bi[0],
554                  0.5*result_bi[1]/float(count_p_phrase)
555                  +0.5*phrase_frequencies[result_bi[0]])])
556        if DEBUG_LEVEL > 1:
557            LOGGER.debug(
558                'Bigram best_candidates=%s',
559                self.best_candidates(phrase_frequencies, title=title_case))
560        if not pp_phrase:
561            # If no context for trigram matching is available, return
562            # what we have so far:
563            return self.best_candidates(phrase_frequencies, title=title_case)
564        sqlstr = ('SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
565                  'WHERE p_phrase = :p_phrase '
566                  'AND pp_phrase = :pp_phrase GROUP BY phrase;')
567        try:
568            results_tri = self.database.execute(sqlstr, sqlargs).fetchall()
569        except Exception:
570            LOGGER.exception(
571                'Unexpected error getting “trigram” data from user_db')
572        if not results_tri:
573            # if no trigram could be matched, return what we have so far:
574            return self.best_candidates(phrase_frequencies, title=title_case)
575        # get the total count of (p_phrase, pp_phrase) pairs to
576        # normalize the bigram frequencies:
577        sqlstr = (
578            'SELECT sum(user_freq) FROM like_input_phrase_view '
579            'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;')
580        try:
581            count_pp_phrase_p_phrase = self.database.execute(
582                sqlstr, sqlargs).fetchall()[0][0]
583        except Exception:
584            LOGGER.exception(
585                'Unexpected error getting total trigram count from user_db')
586        # Update the phrase frequency dictionary by using a linear
587        # combination of the bigram and the trigram results, giving
588        # both the weight of 0.5 (that makes the total weights: 0.25 *
589        # unigram + 0.25 * bigram + 0.5 * trigram, i.e. the trigrams
590        # get higher weight):
591        for result_tri in results_tri:
592            phrase_frequencies.update(
593                [(result_tri[0],
594                  0.5*result_tri[1]/float(count_pp_phrase_p_phrase)
595                  +0.5*phrase_frequencies[result_tri[0]])])
596        if DEBUG_LEVEL > 1:
597            LOGGER.debug(
598                'Trigram best_candidates=%s',
599                self.best_candidates(phrase_frequencies, title=title_case))
600        return self.best_candidates(phrase_frequencies, title=title_case)
601
602    def generate_userdb_desc(self) -> None:
603        '''
604        Add a description table to the user database
605
606        This adds the database version and  the create time
607        '''
608        try:
609            sqlstring = ('CREATE TABLE IF NOT EXISTS user_db.desc '
610                         + '(name PRIMARY KEY, value);')
611            self.database.executescript(sqlstring)
612            sqlstring = 'INSERT OR IGNORE INTO user_db.desc  VALUES (?, ?);'
613            self.database.execute(
614                sqlstring, ('version', USER_DATABASE_VERSION))
615            sqlstring = (
616                'INSERT OR IGNORE INTO user_db.desc '
617                + 'VALUES (?, DATETIME("now", "localtime"));')
618            self.database.execute(sqlstring, ("create-time", ))
619            self.database.commit()
620        except Exception:
621            LOGGER.exception('Unexpected error adding description to user_db.')
622
623    def init_user_db(self) -> None:
624        '''
625        Initialize the user database unless it is an in-memory database
626        '''
627        if self.user_db_file == ':memory:':
628            return
629        if not path.exists(self.user_db_file):
630            self.database = sqlite3.connect(self.user_db_file)
631            self.set_database_pragma_options()
632
633    def set_database_pragma_options(self) -> None:
634        '''Set options for the user database'''
635        # a database containing the complete German Hunspell
636        # dictionary has less then 6000 pages. 20000 pages
637        # should be enough to cache the complete database
638        # in most cases.
639        self.database.executescript('''
640        PRAGMA encoding = "UTF-8";
641        PRAGMA case_sensitive_like = true;
642        PRAGMA page_size = 4096;
643        PRAGMA cache_size = 20000;
644        PRAGMA temp_store = MEMORY;
645        PRAGMA journal_mode = WAL;
646        PRAGMA journal_size_limit = 1000000;
647        PRAGMA synchronous = NORMAL;
648        PRAGMA auto_vacuum = FULL;
649        ''')
650        self.database.commit()
651
652    def get_database_desc(self, db_file: str) -> Optional[Dict[str, str]]:
653        '''Get the description of the database'''
654        if not path.exists(db_file):
655            return None
656        try:
657            database = sqlite3.connect(db_file)
658            desc = {}
659            for row in database.execute("SELECT * FROM desc;").fetchall():
660                desc[row[0]] = row[1]
661            database.close()
662            return desc
663        except Exception:
664            LOGGER.exception('Unexpected error getting database description.')
665            return None
666
667    def get_number_of_columns_of_phrase_table(
668            self, db_file: str) -> Optional[int]:
669        # pylint: disable=line-too-long
670        '''
671        Get the number of columns in the 'phrases' table in
672        the database in db_file.
673
674        Determines the number of columns by parsing this:
675
676        sqlite> select sql from sqlite_master where name='phrases';
677CREATE TABLE phrases (id INTEGER PRIMARY KEY, input_phrase TEXT, phrase TEXT, p_phrase TEXT, pp_phrase TEXT, user_freq INTEGER, timestamp REAL)
678        sqlite>
679
680        This result could be on a single line, as above, or on multiple
681        lines.
682        '''
683        # pylint: enable=line-too-long
684        if not path.exists(db_file):
685            return None
686        try:
687            database = sqlite3.connect(db_file)
688            table_phrases_result = database.execute(
689                "select sql from sqlite_master where name='phrases';"
690            ).fetchall()
691            # Remove possible line breaks from the string where we
692            # want to match:
693            string = ' '.join(table_phrases_result[0][0].splitlines())
694            match = re.match(r'.*\((.*)\)', string)
695            if match:
696                table_phrases_columns = match.group(1).split(',')
697                return len(table_phrases_columns)
698            return 0
699        except Exception:
700            LOGGER.exception(
701                'Unexpected error getting number of columns of database.')
702            return 0
703
704    def list_user_shortcuts(self) -> List[Tuple[str, str]]:
705        '''Returns a list of user defined shortcuts from the user database.
706        '''
707        sqlstr = '''
708        SELECT input_phrase, phrase FROM user_db.phrases WHERE user_freq >= :freq
709        ;'''
710        sqlargs = {'freq': itb_util.SHORTCUT_USER_FREQ}
711        if DEBUG_LEVEL > 1:
712            LOGGER.debug('sqlstr=%s', sqlstr)
713            LOGGER.debug('sqlargs=%s', sqlargs)
714        result = self.database.execute(sqlstr, sqlargs).fetchall()
715        if DEBUG_LEVEL > 1:
716            LOGGER.debug('result=%s', result)
717        return result
718
719    def check_phrase_and_update_frequency(
720            self,
721            input_phrase: str = '',
722            phrase: str = '',
723            p_phrase: str = '',
724            pp_phrase: str = '',
725            user_freq_increment: int = 1,
726            commit: bool = True) -> None:
727        '''
728        Check whether input_phrase and phrase are already in database. If
729        they are in the database, increase the frequency by 1, if not
730        add them.
731        '''
732        if not input_phrase:
733            input_phrase = phrase
734        if not phrase:
735            return
736        phrase = unicodedata.normalize(
737            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
738        p_phrase = itb_util.remove_accents(p_phrase.lower())
739        pp_phrase = itb_util.remove_accents(pp_phrase.lower())
740        input_phrase = itb_util.remove_accents(input_phrase.lower())
741
742        if DEBUG_LEVEL > 1:
743            LOGGER.debug(
744                'phrase=%(p)s, input_phrase=%(t)s',
745                {'p': phrase, 't': input_phrase})
746
747        # There should never be more than 1 database row for the same
748        # input_phrase *and* phrase. So the following query on
749        # the database should match at most one database
750        # row and the length of the result array should be 0 or
751        # 1. So the “GROUP BY phrase” is actually redundant. It is
752        # only a safeguard for the case when duplicate rows have been
753        # added to the database accidentally (But in that case there
754        # is a bug somewhere else which should be fixed).
755        sqlstr = '''
756        SELECT max(user_freq) FROM user_db.phrases
757        WHERE input_phrase = :input_phrase
758        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
759        GROUP BY phrase
760        ;'''
761        sqlargs = {'input_phrase': input_phrase,
762                   'phrase': phrase,
763                   'p_phrase': p_phrase,
764                   'pp_phrase': pp_phrase}
765        if DEBUG_LEVEL > 1:
766            LOGGER.debug(
767                'TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s',
768                sqlstr)
769            LOGGER.debug(
770                'TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s',
771                sqlargs)
772        result = self.database.execute(sqlstr, sqlargs).fetchall()
773        if DEBUG_LEVEL > 1:
774            LOGGER.debug(
775                'check_phrase_and_update_frequency() result=%s', result)
776        if result:
777            # A match was found in user_db, increase user frequency by
778            # user_freq_increment (1 by default)
779            self.update_phrase(input_phrase=input_phrase,
780                               phrase=phrase,
781                               p_phrase=p_phrase,
782                               pp_phrase=pp_phrase,
783                               user_freq=result[0][0]+user_freq_increment,
784                               commit=commit)
785            return
786        # The phrase was not found in user_db.
787        # Add it as a new phrase, i.e. with user_freq = user_freq_increment
788        # (1 by default):
789        self.add_phrase(input_phrase=input_phrase,
790                        phrase=phrase,
791                        p_phrase=p_phrase,
792                        pp_phrase=pp_phrase,
793                        user_freq=user_freq_increment,
794                        commit=commit)
795        return
796
797    def remove_phrase(
798            self,
799            input_phrase: str = '',
800            phrase: str = '',
801            commit: bool = True) -> None:
802        '''
803        Remove all rows matching “input_phrase” and “phrase” from database.
804        Or, if “input_phrase” is “None”, remove all rows matching “phrase”
805        no matter for what input phrase from the database.
806        '''
807        if DEBUG_LEVEL > 1:
808            LOGGER.debug(
809                'TabSqliteDb.remove_phrase() phrase=%(p)s',
810                {'p': phrase.encode('UTF-8')})
811        if not phrase:
812            return
813        phrase = unicodedata.normalize(
814            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
815        if input_phrase:
816            input_phrase = unicodedata.normalize(
817                itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
818        if input_phrase:
819            delete_sqlstr = '''
820            DELETE FROM user_db.phrases
821            WHERE input_phrase = :input_phrase AND phrase = :phrase
822            ;'''
823        else:
824            delete_sqlstr = '''
825            DELETE FROM user_db.phrases
826            WHERE phrase = :phrase
827            ;'''
828        delete_sqlargs = {'input_phrase': input_phrase, 'phrase': phrase}
829        self.database.execute(delete_sqlstr, delete_sqlargs)
830        if commit:
831            self.database.commit()
832
833    def extract_user_phrases(self) -> List[Tuple[str, int]]:
834        '''extract user phrases from database'''
835        try:
836            database = sqlite3.connect(self.user_db_file)
837            database.execute('PRAGMA wal_checkpoint;')
838            phrases = database.execute(
839                'SELECT phrase, sum(user_freq) FROM phrases GROUP BY phrase;'
840            ).fetchall()
841            database.close()
842            phrases = [
843                (unicodedata.normalize(
844                    itb_util.NORMALIZATION_FORM_INTERNAL, x[0]), x[1])
845                for x in
846                phrases
847            ]
848            return phrases[:]
849        except Exception:
850            LOGGER.exception('Unexpected error extracting user phrases.')
851            return []
852
853    def read_training_data_from_file(self, filename: str) -> bool:
854        '''
855        Read data to train the prediction from a text file.
856
857        :param filename: Full path of the text file to read.
858        '''
859        if not os.path.isfile(filename):
860            filename += '.gz'
861            if not os.path.isfile(filename):
862                return False
863        open_function: Callable = open
864        if filename.endswith('.gz'):
865            open_function = gzip.open
866        rows = self.database.execute(
867            'SELECT input_phrase, phrase, p_phrase, pp_phrase, '
868            + 'user_freq, timestamp FROM phrases;').fetchall()
869        rows = sorted(rows, key = lambda x: (x[5])) # sort by timestamp
870        time_min = time.time()
871        time_max = time.time()
872        if rows:
873            time_min = rows[0][5]
874            time_max = rows[-1][5]
875        # timestamp for added entries (timestamp of existing entries is kept):
876        time_new = time_min + 0.20 * (time_max - time_min)
877        LOGGER.info('Minimum timestamp in the database=%s',
878                    time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_min)))
879        LOGGER.info('Maximum timestamp in the database=%s',
880                    time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_max)))
881        LOGGER.info('New timestamp in the database=%s',
882                    time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_new)))
883        p_token = ''
884        pp_token = ''
885        database_dict: Dict[Tuple[str, str, str, str], Dict[str, Any]] = {}
886        for row in rows:
887            # itb_util.remove_accents() returns in NORMALIZATION_FORM_INTERNAL.
888            # row[1] (“phrase”) should already be in NORMALIZATION_FORM_INTERNAL
889            # but better convert it again here just to make sure.
890            input_phrase = itb_util.remove_accents(row[0].lower())
891            phrase = unicodedata.normalize(
892                itb_util.NORMALIZATION_FORM_INTERNAL, row[1])
893            p_phrase = itb_util.remove_accents(row[2].lower())
894            pp_phrase = itb_util.remove_accents(row[3].lower())
895            database_dict.update([((row[0], row[1], row[2], row[3]),
896                                   {'input_phrase': input_phrase,
897                                    'phrase': phrase,
898                                    'p_phrase': p_phrase,
899                                    'pp_phrase': pp_phrase,
900                                    'user_freq': row[4],
901                                    'timestamp': row[5]}
902                                  )])
903        lines = []
904        try:
905            with open_function(
906                    filename, mode='rt', encoding='UTF-8') as file_handle:
907                lines = [
908                    unicodedata.normalize(
909                        itb_util.NORMALIZATION_FORM_INTERNAL, line)
910                    for line in file_handle.readlines()]
911        except Exception:
912            LOGGER.exception(
913                'Unexpected error reading training data from file.')
914            return False
915        for line in lines:
916            for token in itb_util.tokenize(line):
917                key = (itb_util.remove_accents(token.lower()),
918                       token,
919                       itb_util.remove_accents(p_token.lower()),
920                       itb_util.remove_accents(pp_token.lower()))
921                if key in database_dict:
922                    database_dict[key]['user_freq'] += 1
923                else:
924                    database_dict[key] = {
925                        'input_phrase': itb_util.remove_accents(token.lower()),
926                        'phrase': token,
927                        'p_phrase': itb_util.remove_accents(p_token.lower()),
928                        'pp_phrase': itb_util.remove_accents(pp_token.lower()),
929                        'user_freq': 1,
930                        'timestamp': time_new}
931                pp_token = p_token
932                p_token = token
933        sqlargs = []
934        for key in database_dict:
935            sqlargs.append(database_dict[key])
936        sqlstr = '''
937        INSERT INTO user_db.phrases (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
938        VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
939        ;'''
940        try:
941            self.database.execute('DELETE FROM phrases;')
942            # Without the following commit, the
943            # self.database.executemany() fails with
944            # “OperationalError: database is locked”.
945            self.database.commit()
946            self.database.executemany(sqlstr, sqlargs)
947            self.database.commit()
948            self.database.execute('PRAGMA wal_checkpoint;')
949        except Exception:
950            LOGGER.exception(
951                'Unexpected error writing training data to database.')
952            return False
953        return True
954
955    def remove_all_phrases(self) -> None:
956        '''
957        Remove all phrases from the database, i.e. delete all the
958        data learned from user input or text files.
959        '''
960        try:
961            self.database.execute('DELETE FROM phrases;')
962            self.database.commit()
963            self.database.execute('PRAGMA wal_checkpoint;')
964        except Exception:
965            LOGGER.exception(
966                'Unexpected error removing all phrases from database.')
967
968    def dump_database(self) -> None:
969        '''
970        Dump the contents of the database to the log
971
972        (For debugging)
973        '''
974        try:
975            LOGGER.debug('SELECT * FROM desc;\n')
976            for row in self.database.execute("SELECT * FROM desc;").fetchall():
977                LOGGER.debug('%s', repr(row))
978            LOGGER.debug('SELECT * FROM phrases;\n')
979            for row in self.database.execute(
980                    "SELECT * FROM phrases;").fetchall():
981                LOGGER.debug('%s', repr(row))
982        except Exception:
983            LOGGER.exception('Unexpected error dumping database.')
984            return
985
986    def number_of_rows_in_database(self) -> int:
987        '''
988        Return the current number of rows in the database
989
990        (For debugging)
991        '''
992        try:
993            return len(self.database.execute(
994                "SELECT * FROM phrases;").fetchall())
995        except Exception:
996            LOGGER.exception('Unexpected error getting number of database rows.')
997            return -1
998
999    def cleanup_database(self, thread: bool = True) -> None:
1000        '''Cleanup user database by expiring entries which have not been
1001        used for a long time.
1002
1003        :param thread: Whether this function is called in a different thread or not.
1004
1005        Usually it is called in a thread started when Typing Booster
1006        has just finished starting up and is ready for input.
1007
1008        But in unittest test cases it may be called with an in memory
1009        database without starting a thread. In that case a new
1010        database connection is not needed.
1011
1012        '''
1013        if thread and self.user_db_file == ':memory:':
1014            LOGGER.info('Database cleanup not needed for memory database.')
1015            return
1016        LOGGER.info('Database cleanup starting ...')
1017        time_now = time.time()
1018         # id, input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp
1019        rows: List[Tuple[int, str, str, str, str, int, float]] = []
1020        try:
1021            if thread:
1022                # SQLite objects created in a thread can only be used in
1023                # that same thread.  As the database cleanup is usually
1024                # called in a separate thread, get a new connection:
1025                database = sqlite3.connect(self.user_db_file)
1026            else:
1027                database = self.database
1028            rows = database.execute("SELECT * FROM phrases;").fetchall()
1029        except Exception:
1030            LOGGER.exception('Exception when accessing database')
1031            import traceback
1032            traceback.print_exc()
1033        if not rows:
1034            return
1035        rows = sorted(rows,
1036                      key = lambda x: (
1037                          x[5], # user_freq
1038                          x[6], # timestamp
1039                          x[0], # id
1040                      ))
1041        LOGGER.info('Total number of database rows to check=%s ', len(rows))
1042        index = len(rows)
1043        max_rows = 50000
1044        number_delete_above_max = 0
1045        rows_kept: List[Tuple[int, str, str, str, str, int, float]] = []
1046        for row in rows:
1047            user_freq = row[5]
1048            if (index > max_rows
1049                and user_freq < itb_util.SHORTCUT_USER_FREQ):
1050                LOGGER.info('1st pass: deleting %s %s',
1051                            repr(row),
1052                            time.strftime("%Y-%m-%d %H:%M:%S",
1053                                          time.gmtime(row[6])))
1054                number_delete_above_max += 1
1055                sqlstr_delete = 'DELETE from phrases WHERE id = :id;'
1056                sqlargs_delete = {'id': row[0]}
1057                try:
1058                    database.execute(sqlstr_delete, sqlargs_delete)
1059                except Exception:
1060                    LOGGER.exception(
1061                        '1st pass: exception deleting row from database.')
1062                    import traceback
1063                    traceback.print_exc()
1064            else:
1065                rows_kept.append(row)
1066            index -= 1
1067        LOGGER.info('1st pass: Number of rows deleted above maximum size='
1068                    f'{number_delete_above_max}')
1069        # As the first pass above removes rows sorted by count and
1070        # then by timestamp, it will never remove rows with a higher
1071        # count even if they are extremely old. Therefore, a second
1072        # pass uses sorting only by timestamp in order to first decay and
1073        # eventually remove some rows which have not been used for a
1074        # long time as well, even if they have a higher count.
1075        # In this second pass, the 0.1% oldest rows are checked
1076        # and:
1077        #
1078        # - if user_freq == 1 remove the row
1079        # - if user_freq > 1 divide user_freq by 2 and update timestamp to “now”
1080        #
1081        # 0.1% is really not much but I want to be careful not to remove
1082        # too much when trying this out.
1083        #
1084        # sort kept rows by timestamp only instead of user_freq and timestamp:
1085        rows_kept = sorted(rows_kept,
1086                           key = lambda x: (
1087                               x[6], # timestamp
1088                               x[0], # id
1089                           ))
1090        index = len(rows_kept)
1091        LOGGER.info('1st pass: Number of rows kept=%s', index)
1092        index_decay = int(max_rows * 0.999)
1093        LOGGER.info('2nd pass: Index for decay=%s', index_decay)
1094        number_of_rows_to_decay = 0
1095        number_of_rows_to_delete = 0
1096        for row in rows_kept:
1097            user_freq = row[5]
1098            if (index > index_decay
1099                and user_freq < itb_util.SHORTCUT_USER_FREQ):
1100                if user_freq == 1:
1101                    LOGGER.info('2nd pass: deleting %s %s',
1102                                repr(row),
1103                                time.strftime("%Y-%m-%d %H:%M:%S",
1104                                              time.gmtime(row[6])))
1105                    number_of_rows_to_delete += 1
1106                    sqlstr_delete = 'DELETE from phrases WHERE id = :id;'
1107                    sqlargs_delete = {'id': row[0]}
1108                    try:
1109                        database.execute(sqlstr_delete, sqlargs_delete)
1110                    except Exception:
1111                        LOGGER.exception(
1112                            '2nd pass: exception deleting row from database.')
1113                        import traceback
1114                        traceback.print_exc()
1115                else:
1116                    LOGGER.info('2nd pass: decaying %s %s',
1117                                repr(row),
1118                                time.strftime("%Y-%m-%d %H:%M:%S",
1119                                              time.gmtime(row[6])))
1120                    number_of_rows_to_decay += 1
1121                    sqlstr_update = '''
1122                    UPDATE phrases
1123                    SET user_freq = :user_freq, timestamp = :timestamp
1124                    WHERE id = :id
1125                    ;'''
1126                    sqlargs_update = {'id': row[0],
1127                                      'user_freq': int(user_freq/2),
1128                                      'timestamp': time.time()}
1129
1130                    try:
1131                        database.execute(sqlstr_update, sqlargs_update)
1132                    except Exception:
1133                        LOGGER.exception(
1134                            '2nd pass: exception decaying row from database.')
1135                        import traceback
1136                        traceback.print_exc()
1137            index -= 1
1138        LOGGER.info('Commit database and execute checkpoint ...')
1139        database.commit()
1140        database.execute('PRAGMA wal_checkpoint;')
1141        LOGGER.info('Rebuild database using VACUUM command ...')
1142        database.execute('VACUUM;')
1143        LOGGER.info('Number of database rows deleted=%s',
1144                     number_delete_above_max + number_of_rows_to_delete)
1145        LOGGER.info('Number of database rows decayed=%s',
1146                    number_of_rows_to_decay)
1147        LOGGER.info('Number of rows before cleanup=%s', len(rows))
1148        LOGGER.info('Number of rows remaining=%s',
1149                    len(rows_kept) - number_of_rows_to_delete)
1150        LOGGER.info('Time for database cleanup=%s seconds',
1151                    time.time() - time_now)
1152        LOGGER.info('Database cleanup finished.')
1153