# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-typing-booster - A completion input method for IBus
#
# Copyright (c) 2011-2013 Anish Patil <apatil@redhat.com>
# Copyright (c) 2012-2018 Mike FABIAN <mfabian@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
'''
Module for ibus-typing-booster to access the sqlite3 databases
'''

from typing import Dict
from typing import List
from typing import Tuple
from typing import Optional
from typing import Any
from typing import Callable
import os
import os.path as path
import codecs
import unicodedata
import sqlite3
import time
import re
import gzip
import logging
import itb_util
import hunspell_suggest

LOGGER = logging.getLogger('ibus-typing-booster')

# Debug verbosity; overridden in TabSqliteDb.__init__() from the
# IBUS_TYPING_BOOSTER_DEBUG_LEVEL environment variable.
DEBUG_LEVEL = int(0)

# Version of the schema of the user database. If the database found on
# disk has a different version, __init__() renames it away and creates
# a fresh, empty database (recovering the phrases if possible).
USER_DATABASE_VERSION = '0.65'

class TabSqliteDb:
    # pylint: disable=line-too-long
    '''Phrase databases for ibus-typing-booster

    The phrases table in the database has columns with the names:

    “id”, “input_phrase”, “phrase”, “p_phrase”, “pp_phrase”, “user_freq”, “timestamp”

    There are 2 databases, sysdb, userdb.

    sysdb: “Database” with the suggestions from the hunspell dictionaries
        user_freq = 0 always.

        Actually there is no Sqlite3 database called “sysdb”, these
        are the suggestions coming from hunspell_suggest, i.e. from
        grepping the hunspell dictionaries and from pyhunspell.
        (Historic note: ibus-typing-booster started as a fork of
        ibus-table, in ibus-table “sysdb” is a Sqlite3 database
        which is installed systemwide and readonly for the user)

    user_db: Database on disk where the phrases learned from the user are stored
        user_freq >= 1: The number of times the user has used this phrase
    '''
    # pylint: enable=line-too-long
    def __init__(self, user_db_file='') -> None:
        '''Open (or create) the user database and set up hunspell suggestions.

        :param user_db_file: Full path of the user database file to use.
                             If empty, falls back to
                             ~/.local/share/ibus-typing-booster/user.db
                             when $HOME is set, else to the in-memory
                             database ':memory:'.
        '''
        global DEBUG_LEVEL
        try:
            DEBUG_LEVEL = int(str(os.getenv('IBUS_TYPING_BOOSTER_DEBUG_LEVEL')))
        except (TypeError, ValueError):
            DEBUG_LEVEL = int(0)
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'TabSqliteDb.__init__(user_db_file = %s)', user_db_file)
        self.user_db_file = user_db_file
        if not self.user_db_file and os.getenv('HOME'):
            self.user_db_file = path.join(
                str(os.getenv('HOME')), '.local/share/ibus-typing-booster/user.db')
        if not self.user_db_file:
            LOGGER.debug('Falling back to ":memory:" for user.db')
            self.user_db_file = ':memory:'
        if (self.user_db_file != ':memory:'
                and not os.path.isdir(os.path.dirname(self.user_db_file))):
            os.makedirs(os.path.dirname(self.user_db_file), exist_ok=True)
        # Expected schema of the “phrases” table; used below to detect
        # an incompatible database on disk.
        self._phrase_table_column_names = [
            'id',
            'input_phrase',
            'phrase',
            'p_phrase',
            'pp_phrase',
            'user_freq',
            'timestamp']

        # Phrases recovered from an old, incompatible database (if any);
        # used further down to seed the newly created database.
        self.old_phrases: List[Tuple[str, int]] = []

        self.hunspell_obj = hunspell_suggest.Hunspell(())

        if self.user_db_file != ':memory:':
            if not os.path.exists(self.user_db_file):
                LOGGER.info(
                    'The user database %(udb)s does not exist yet.',
                    {'udb': self.user_db_file})
            else:
                # A database file exists: check whether its version and
                # column layout match what this version of
                # ibus-typing-booster expects.
                try:
                    desc = self.get_database_desc(self.user_db_file)
                    if (desc is None
                            or desc["version"] != USER_DATABASE_VERSION
                            or (self.get_number_of_columns_of_phrase_table(
                                self.user_db_file)
                                != len(self._phrase_table_column_names))):
                        LOGGER.info(
                            'The user database %(udb)s seems incompatible',
                            {'udb': self.user_db_file})
                        if desc is None:
                            LOGGER.info(
                                'No version information in the database')
                        elif desc["version"] != USER_DATABASE_VERSION:
                            LOGGER.info(
                                'The version of the database does not match '
                                '(too old or too new?)')
                            LOGGER.info(
                                'ibus-typing-booster wants version=%s',
                                USER_DATABASE_VERSION)
                            LOGGER.info(
                                'But the database actually has version=%s',
                                desc["version"])
                        elif (self.get_number_of_columns_of_phrase_table(
                                self.user_db_file)
                              != len(self._phrase_table_column_names)):
                            LOGGER.info(
                                'The number of columns of the database '
                                'does not match')
                            LOGGER.info(
                                'ibus-typing-booster expects %(col)s columns',
                                {'col': len(self._phrase_table_column_names)})
                            LOGGER.info(
                                'The database actually has %(col)s columns',
                                {'col':
                                 self.get_number_of_columns_of_phrase_table(
                                     self.user_db_file)})
                        LOGGER.info(
                            'Trying to recover the phrases from the old, '
                            'incompatible database')
                        self.old_phrases = self.extract_user_phrases()
                        timestamp = time.strftime('-%Y-%m-%d_%H:%M:%S')
                        LOGGER.info(
                            'Renaming the incompatible database to "%(name)s"',
                            {'name': self.user_db_file+timestamp})
                        if os.path.exists(self.user_db_file):
                            os.rename(self.user_db_file,
                                      self.user_db_file+timestamp)
                        # Also move the WAL side files (-shm, -wal) out of
                        # the way so they cannot be applied to the new db:
                        if os.path.exists(self.user_db_file+'-shm'):
                            os.rename(self.user_db_file+'-shm',
                                      self.user_db_file+'-shm'+timestamp)
                        if os.path.exists(self.user_db_file+'-wal'):
                            os.rename(self.user_db_file+'-wal',
                                      self.user_db_file+'-wal'+timestamp)
                        LOGGER.info(
                            'Creating a new, empty database "%(name)s".',
                            {'name': self.user_db_file})
                        self.init_user_db()
                        LOGGER.info(
                            'If user phrases were successfully recovered '
                            'from the old, '
                            'incompatible database, they will be used to '
                            'initialize the new database.')
                    else:
                        LOGGER.info(
                            'Compatible database %(db)s found.',
                            {'db': self.user_db_file})
                except Exception:
                    LOGGER.exception(
                        'Unexpected error trying to find user database.')

        # open user phrase database
        try:
            LOGGER.info(
                'Connect to the database %(name)s.',
                {'name': self.user_db_file})
            self.database = sqlite3.connect(self.user_db_file)
            self.set_database_pragma_options()
            self.database.executescript(
                'ATTACH DATABASE "%s" AS user_db;' % self.user_db_file)
        except Exception:
            # Opening failed (e.g. the file is corrupt or not a
            # database): move it out of the way and start fresh.
            LOGGER.exception(
                'Could not open the database %(name)s.',
                {'name': self.user_db_file})
            timestamp = time.strftime('-%Y-%m-%d_%H:%M:%S')
            LOGGER.info(
                'Renaming the incompatible database to "%(name)s".',
                {'name': self.user_db_file+timestamp})
            if os.path.exists(self.user_db_file):
                os.rename(self.user_db_file, self.user_db_file+timestamp)
            if os.path.exists(self.user_db_file+'-shm'):
                os.rename(self.user_db_file+'-shm',
                          self.user_db_file+'-shm'+timestamp)
            if os.path.exists(self.user_db_file+'-wal'):
                os.rename(self.user_db_file+'-wal',
                          self.user_db_file+'-wal'+timestamp)
            LOGGER.info(
                'Creating a new, empty database "%(name)s".',
                {'name': self.user_db_file})
            self.init_user_db()
            self.database = sqlite3.connect(self.user_db_file)
            self.set_database_pragma_options()
            self.database.executescript(
                'ATTACH DATABASE "%s" AS user_db;' % self.user_db_file)
        self.create_tables()
        if self.old_phrases:
            # Seed the new database with the phrases recovered from the
            # old, incompatible database. Only phrase and user_freq were
            # recoverable; input_phrase is set to the phrase itself and
            # the context columns are left empty.
            sqlargs = []
            for ophrase in self.old_phrases:
                sqlargs.append(
                    {'input_phrase': ophrase[0],
                     'phrase': ophrase[0],
                     'p_phrase': '',
                     'pp_phrase': '',
                     'user_freq': ophrase[1],
                     'timestamp': time.time()})
            sqlstr = '''
            INSERT INTO user_db.phrases (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
            VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
            ;'''
            try:
                self.database.executemany(sqlstr, sqlargs)
            except Exception:
                LOGGER.exception(
                    'Unexpected error inserting old phrases '
                    'into the user database.')
            self.database.commit()
            self.database.execute('PRAGMA wal_checkpoint;')

        # try create all hunspell-tables in user database
        self.create_indexes(commit=False)
        self.generate_userdb_desc()

    def update_phrase(
            self,
            input_phrase: str = '',
            phrase: str = '',
            p_phrase: str = '',
            pp_phrase: str = '',
            user_freq: int = 0,
            commit: bool = True) -> None:
        '''
        update the user frequency of a phrase

        :param input_phrase: The typed input the phrase was selected for.
        :param phrase: The phrase whose row is updated.
        :param p_phrase: The word before the phrase (bigram context).
        :param pp_phrase: The word before p_phrase (trigram context).
        :param user_freq: The new frequency value to store.
        :param commit: Whether to commit the transaction immediately.
        '''
        if not input_phrase or not phrase:
            return
        # Match the normalization used when the row was inserted
        # (see add_phrase()):
        input_phrase = itb_util.remove_accents(input_phrase.lower())
        phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
        p_phrase = itb_util.remove_accents(p_phrase.lower())
        pp_phrase = itb_util.remove_accents(pp_phrase.lower())
        sqlstr = '''
        UPDATE user_db.phrases
        SET user_freq = :user_freq, timestamp = :timestamp
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        ;'''
        sqlargs = {'user_freq': user_freq,
                   'input_phrase': input_phrase,
                   'phrase': phrase,
                   'p_phrase': p_phrase,
                   'pp_phrase': pp_phrase,
                   'timestamp': time.time()}
        if DEBUG_LEVEL > 1:
            LOGGER.debug('sqlstr=%s', sqlstr)
            LOGGER.debug('sqlargs=%s', sqlargs)
        try:
            self.database.execute(sqlstr, sqlargs)
            if commit:
                self.database.commit()
        except Exception:
            LOGGER.exception('Unexpected error updating phrase in user_db.')

    def sync_usrdb(self) -> None:
        '''
        Trigger a checkpoint operation.
284 ''' 285 if DEBUG_LEVEL > 1: 286 LOGGER.debug('commit and execute checkpoint ...') 287 self.database.commit() 288 self.database.execute('PRAGMA wal_checkpoint;') 289 if DEBUG_LEVEL > 1: 290 LOGGER.debug('commit and execute checkpoint done.') 291 292 def create_tables(self) -> None: 293 '''Create table for the phrases.''' 294 sqlstr = '''CREATE TABLE IF NOT EXISTS user_db.phrases 295 (id INTEGER PRIMARY KEY, 296 input_phrase TEXT, phrase TEXT, p_phrase TEXT, pp_phrase TEXT, 297 user_freq INTEGER, timestamp REAL);''' 298 self.database.execute(sqlstr) 299 self.database.commit() 300 301 def add_phrase( 302 self, 303 input_phrase: str = '', 304 phrase: str = '', 305 p_phrase: str = '', 306 pp_phrase: str = '', 307 user_freq: int = 0, 308 commit: bool = True) -> None: 309 ''' 310 Add phrase to database 311 ''' 312 if DEBUG_LEVEL > 1: 313 LOGGER.debug( 314 'input_phrase=%s phrase=%s user_freq=%s ', 315 input_phrase.encode('UTF-8'), 316 phrase.encode('UTF-8'), 317 user_freq) 318 if not input_phrase or not phrase: 319 return 320 input_phrase = itb_util.remove_accents(input_phrase.lower()) 321 phrase = unicodedata.normalize( 322 itb_util.NORMALIZATION_FORM_INTERNAL, phrase) 323 p_phrase = itb_util.remove_accents(p_phrase.lower()) 324 pp_phrase = itb_util.remove_accents(pp_phrase.lower()) 325 select_sqlstr = ''' 326 SELECT * FROM user_db.phrases 327 WHERE input_phrase = :input_phrase 328 AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase 329 ;''' 330 select_sqlargs = { 331 'input_phrase': input_phrase, 332 'phrase': phrase, 333 'p_phrase': p_phrase, 334 'pp_phrase': pp_phrase} 335 if self.database.execute(select_sqlstr, select_sqlargs).fetchall(): 336 # there is already such a phrase, i.e. add_phrase was called 337 # in error, do nothing to avoid duplicate entries. 
338 return 339 340 insert_sqlstr = ''' 341 INSERT INTO user_db.phrases 342 (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp) 343 VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp) 344 ;''' 345 insert_sqlargs = {'input_phrase': input_phrase, 346 'phrase': phrase, 347 'p_phrase': p_phrase, 348 'pp_phrase': pp_phrase, 349 'user_freq': user_freq, 350 'timestamp': time.time()} 351 if DEBUG_LEVEL > 1: 352 LOGGER.debug('insert_sqlstr=%s', insert_sqlstr) 353 LOGGER.debug('insert_sqlargs=%s', insert_sqlargs) 354 try: 355 self.database.execute(insert_sqlstr, insert_sqlargs) 356 if commit: 357 self.database.commit() 358 except Exception: 359 LOGGER.exception('Unexpected error adding phrase to database.') 360 361 def create_indexes(self, commit: bool = True) -> None: 362 '''Create indexes for the database.''' 363 sqlstr = ''' 364 CREATE INDEX IF NOT EXISTS user_db.phrases_index_p ON phrases 365 (input_phrase, id ASC); 366 CREATE INDEX IF NOT EXISTS user_db.phrases_index_i ON phrases 367 (phrase) 368 ;''' 369 self.database.executescript(sqlstr) 370 if commit: 371 self.database.commit() 372 373 def best_candidates( 374 self, 375 phrase_frequencies: Dict[str, float], 376 title=False) -> List[Tuple[str, float]]: 377 '''Sorts the phrase_frequencies dictionary and returns the best 378 candidates. 379 380 Should *not* change the phrase_frequencies dictionary! 
381 ''' 382 candidates = sorted(phrase_frequencies.items(), 383 key=lambda x: ( 384 -1*x[1], # user_freq descending 385 len(x[0]), # len(phrase) ascending 386 x[0] # phrase alphabetical 387 ))[:20] 388 if not title: 389 return candidates 390 candidates_title = [] 391 phrases_title = set() 392 for candidate in candidates: 393 phrase = candidate[0] 394 phrase_title = phrase[:1].title() + phrase[1:] 395 if phrase_title in phrases_title: 396 continue 397 candidates_title.append((phrase_title, candidate[1])) 398 phrases_title.add(phrase_title) 399 return candidates_title 400 401 def select_words( 402 self, 403 input_phrase: str, 404 p_phrase: str = '', 405 pp_phrase: str = '') -> List[Tuple[str, float]]: 406 ''' 407 Get phrases from database completing input_phrase. 408 409 Returns a list of matches where each match is a tuple in the 410 form of (phrase, user_freq), i.e. returns something like 411 [(phrase, user_freq), ...] 412 ''' 413 input_phrase = unicodedata.normalize( 414 itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase) 415 p_phrase = itb_util.remove_accents(p_phrase.lower()) 416 pp_phrase = itb_util.remove_accents(pp_phrase.lower()) 417 title_case = input_phrase.istitle() 418 if DEBUG_LEVEL > 1: 419 LOGGER.debug( 420 'input_phrase=%s p_phrase=%s pp_phrase=%s', 421 input_phrase, p_phrase, pp_phrase) 422 phrase_frequencies: Dict[str, float] = {} 423 if not ' ' in input_phrase: 424 # Get suggestions from hunspell dictionaries. But only 425 # if input_phrase does not contain spaces. The hunspell 426 # dictionaries contain only single words, not sentences. 427 # Trying to complete an input_phrase which contains spaces 428 # will never work and spell checking suggestions by hunspell 429 # for input which contains spaces is almost always nonsense. 
430 phrase_frequencies.update([ 431 x for x in self.hunspell_obj.suggest(input_phrase)]) 432 if DEBUG_LEVEL > 1: 433 LOGGER.debug( 434 'hunspell: best_candidates=%s', 435 self.best_candidates(phrase_frequencies, title=title_case)) 436 # Remove the accents *after* getting the hunspell candidates. 437 # If the accents were removed before getting the hunspell candidates 438 # an input phrase like “Glühwürmchen” would not be added as a 439 # candidate because hunspell would get “Gluhwurmchen” then and would 440 # not validate that as a correct word. And, because “Glühwürmchen” 441 # is not in the German hunspell dictionary as a single word but 442 # created by suffix and prefix rules, the accent insensitive match 443 # in the German hunspell dictionary would not find it either. 444 input_phrase = itb_util.remove_accents(input_phrase.lower()) 445 # Now phrase_frequencies might contain something like this: 446 # 447 # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0} 448 449 # To quote a string to be used as a parameter when assembling 450 # an sqlite statement with Python string operations, remove 451 # all NUL characters, replace " with "" and wrap the whole 452 # string in double quotes. Assembling sqlite statements using 453 # parameters containing user input with python string operations 454 # is not recommended because of the risk of SQL injection attacks 455 # if the quoting is not done the right way. So it is better to use 456 # the parameter substitution of the sqlite3 python interface. 457 # But unfortunately that does not work when creating views, 458 # (“OperationalError: parameters are not allowed in views”). 
459 quoted_input_phrase = input_phrase.replace( 460 '\x00', '').replace('"', '""') 461 self.database.execute('DROP VIEW IF EXISTS like_input_phrase_view;') 462 sqlstr = ''' 463 CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS 464 SELECT * FROM user_db.phrases 465 WHERE input_phrase LIKE "%(quoted_input_phrase)s%%" 466 ;''' % {'quoted_input_phrase': quoted_input_phrase} 467 try: 468 self.database.execute(sqlstr) 469 except Exception: 470 LOGGER.exception('Unexpected error in creating database view.') 471 sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase} 472 sqlstr = ( 473 'SELECT phrase, sum(user_freq) FROM like_input_phrase_view ' 474 'GROUP BY phrase;') 475 try: 476 # Get “unigram” data from user_db. 477 # 478 # Example: Let’s assume the user typed “co” and user_db contains 479 # 480 # 1|colou|colour|green|nice|1 481 # 2|col|colour|yellow|ugly|2 482 # 3|co|colour|green|awesome|1 483 # 4|co|cold|||1 484 # 5|conspirac|conspiracy|||5 485 # 6|conspi|conspiracy|||1 486 # 7|c|conspiracy|||1 487 results_uni = self.database.execute(sqlstr, sqlargs).fetchall() 488 # Then the result returned by .fetchall() is: 489 # 490 # [('colour', 4), ('cold', 1), ('conspiracy', 6)] 491 # 492 # (“c|conspiracy|1” is not selected because it doesn’t 493 # match the user input “LIKE co%”! I.e. this is filtered 494 # out by the VIEW created above already) 495 except Exception: 496 LOGGER.exception( 497 'Unexpected error getting “unigram” data from user_db.') 498 if not results_uni: 499 # If no unigrams matched, bigrams and trigrams cannot 500 # match either. We can stop here and return what we got 501 # from hunspell. 
502 return self.best_candidates(phrase_frequencies, title=title_case) 503 # Now normalize the unigram frequencies with the total count 504 # (which is 11 in the above example), which gives us the 505 # normalized result: 506 # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)] 507 sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;' 508 try: 509 count = self.database.execute(sqlstr, sqlargs).fetchall()[0][0] 510 except Exception: 511 LOGGER.exception( 512 'Unexpected error getting total unigram count from user_db') 513 # Updating the phrase_frequency dictionary with the normalized 514 # results gives: {'conspiracy': 6/11, 'code': 0, 515 # 'communicability': 0, 'cold': 1/11, 'colour': 4/11} 516 for result_uni in results_uni: 517 phrase_frequencies.update( 518 [(result_uni[0], result_uni[1]/float(count))]) 519 if DEBUG_LEVEL > 1: 520 LOGGER.debug( 521 'Unigram best_candidates=%s', 522 self.best_candidates(phrase_frequencies, title=title_case)) 523 if not p_phrase: 524 # If no context for bigram matching is available, return 525 # what we have so far: 526 return self.best_candidates(phrase_frequencies, title=title_case) 527 sqlstr = ( 528 'SELECT phrase, sum(user_freq) FROM like_input_phrase_view ' 529 'WHERE p_phrase = :p_phrase GROUP BY phrase;') 530 try: 531 results_bi = self.database.execute(sqlstr, sqlargs).fetchall() 532 except Exception: 533 LOGGER.exception( 534 'Unexpected error getting “bigram” data from user_db') 535 if not results_bi: 536 # If no bigram could be matched, return what we have so far: 537 return self.best_candidates(phrase_frequencies, title=title_case) 538 # get the total count of p_phrase to normalize the bigram frequencies: 539 sqlstr = ( 540 'SELECT sum(user_freq) FROM like_input_phrase_view ' 541 'WHERE p_phrase = :p_phrase;') 542 try: 543 count_p_phrase = self.database.execute( 544 sqlstr, sqlargs).fetchall()[0][0] 545 except Exception: 546 LOGGER.exception( 547 'Unexpected error getting total bigram count from user_db') 
548 # Update the phrase frequency dictionary by using a linear 549 # combination of the unigram and the bigram results, giving 550 # both the weight of 0.5: 551 for result_bi in results_bi: 552 phrase_frequencies.update( 553 [(result_bi[0], 554 0.5*result_bi[1]/float(count_p_phrase) 555 +0.5*phrase_frequencies[result_bi[0]])]) 556 if DEBUG_LEVEL > 1: 557 LOGGER.debug( 558 'Bigram best_candidates=%s', 559 self.best_candidates(phrase_frequencies, title=title_case)) 560 if not pp_phrase: 561 # If no context for trigram matching is available, return 562 # what we have so far: 563 return self.best_candidates(phrase_frequencies, title=title_case) 564 sqlstr = ('SELECT phrase, sum(user_freq) FROM like_input_phrase_view ' 565 'WHERE p_phrase = :p_phrase ' 566 'AND pp_phrase = :pp_phrase GROUP BY phrase;') 567 try: 568 results_tri = self.database.execute(sqlstr, sqlargs).fetchall() 569 except Exception: 570 LOGGER.exception( 571 'Unexpected error getting “trigram” data from user_db') 572 if not results_tri: 573 # if no trigram could be matched, return what we have so far: 574 return self.best_candidates(phrase_frequencies, title=title_case) 575 # get the total count of (p_phrase, pp_phrase) pairs to 576 # normalize the bigram frequencies: 577 sqlstr = ( 578 'SELECT sum(user_freq) FROM like_input_phrase_view ' 579 'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;') 580 try: 581 count_pp_phrase_p_phrase = self.database.execute( 582 sqlstr, sqlargs).fetchall()[0][0] 583 except Exception: 584 LOGGER.exception( 585 'Unexpected error getting total trigram count from user_db') 586 # Update the phrase frequency dictionary by using a linear 587 # combination of the bigram and the trigram results, giving 588 # both the weight of 0.5 (that makes the total weights: 0.25 * 589 # unigram + 0.25 * bigram + 0.5 * trigram, i.e. 
the trigrams 590 # get higher weight): 591 for result_tri in results_tri: 592 phrase_frequencies.update( 593 [(result_tri[0], 594 0.5*result_tri[1]/float(count_pp_phrase_p_phrase) 595 +0.5*phrase_frequencies[result_tri[0]])]) 596 if DEBUG_LEVEL > 1: 597 LOGGER.debug( 598 'Trigram best_candidates=%s', 599 self.best_candidates(phrase_frequencies, title=title_case)) 600 return self.best_candidates(phrase_frequencies, title=title_case) 601 602 def generate_userdb_desc(self) -> None: 603 ''' 604 Add a description table to the user database 605 606 This adds the database version and the create time 607 ''' 608 try: 609 sqlstring = ('CREATE TABLE IF NOT EXISTS user_db.desc ' 610 + '(name PRIMARY KEY, value);') 611 self.database.executescript(sqlstring) 612 sqlstring = 'INSERT OR IGNORE INTO user_db.desc VALUES (?, ?);' 613 self.database.execute( 614 sqlstring, ('version', USER_DATABASE_VERSION)) 615 sqlstring = ( 616 'INSERT OR IGNORE INTO user_db.desc ' 617 + 'VALUES (?, DATETIME("now", "localtime"));') 618 self.database.execute(sqlstring, ("create-time", )) 619 self.database.commit() 620 except Exception: 621 LOGGER.exception('Unexpected error adding description to user_db.') 622 623 def init_user_db(self) -> None: 624 ''' 625 Initialize the user database unless it is an in-memory database 626 ''' 627 if self.user_db_file == ':memory:': 628 return 629 if not path.exists(self.user_db_file): 630 self.database = sqlite3.connect(self.user_db_file) 631 self.set_database_pragma_options() 632 633 def set_database_pragma_options(self) -> None: 634 '''Set options for the user database''' 635 # a database containing the complete German Hunspell 636 # dictionary has less then 6000 pages. 20000 pages 637 # should be enough to cache the complete database 638 # in most cases. 
639 self.database.executescript(''' 640 PRAGMA encoding = "UTF-8"; 641 PRAGMA case_sensitive_like = true; 642 PRAGMA page_size = 4096; 643 PRAGMA cache_size = 20000; 644 PRAGMA temp_store = MEMORY; 645 PRAGMA journal_mode = WAL; 646 PRAGMA journal_size_limit = 1000000; 647 PRAGMA synchronous = NORMAL; 648 PRAGMA auto_vacuum = FULL; 649 ''') 650 self.database.commit() 651 652 def get_database_desc(self, db_file: str) -> Optional[Dict[str, str]]: 653 '''Get the description of the database''' 654 if not path.exists(db_file): 655 return None 656 try: 657 database = sqlite3.connect(db_file) 658 desc = {} 659 for row in database.execute("SELECT * FROM desc;").fetchall(): 660 desc[row[0]] = row[1] 661 database.close() 662 return desc 663 except Exception: 664 LOGGER.exception('Unexpected error getting database description.') 665 return None 666 667 def get_number_of_columns_of_phrase_table( 668 self, db_file: str) -> Optional[int]: 669 # pylint: disable=line-too-long 670 ''' 671 Get the number of columns in the 'phrases' table in 672 the database in db_file. 673 674 Determines the number of columns by parsing this: 675 676 sqlite> select sql from sqlite_master where name='phrases'; 677CREATE TABLE phrases (id INTEGER PRIMARY KEY, input_phrase TEXT, phrase TEXT, p_phrase TEXT, pp_phrase TEXT, user_freq INTEGER, timestamp REAL) 678 sqlite> 679 680 This result could be on a single line, as above, or on multiple 681 lines. 
682 ''' 683 # pylint: enable=line-too-long 684 if not path.exists(db_file): 685 return None 686 try: 687 database = sqlite3.connect(db_file) 688 table_phrases_result = database.execute( 689 "select sql from sqlite_master where name='phrases';" 690 ).fetchall() 691 # Remove possible line breaks from the string where we 692 # want to match: 693 string = ' '.join(table_phrases_result[0][0].splitlines()) 694 match = re.match(r'.*\((.*)\)', string) 695 if match: 696 table_phrases_columns = match.group(1).split(',') 697 return len(table_phrases_columns) 698 return 0 699 except Exception: 700 LOGGER.exception( 701 'Unexpected error getting number of columns of database.') 702 return 0 703 704 def list_user_shortcuts(self) -> List[Tuple[str, str]]: 705 '''Returns a list of user defined shortcuts from the user database. 706 ''' 707 sqlstr = ''' 708 SELECT input_phrase, phrase FROM user_db.phrases WHERE user_freq >= :freq 709 ;''' 710 sqlargs = {'freq': itb_util.SHORTCUT_USER_FREQ} 711 if DEBUG_LEVEL > 1: 712 LOGGER.debug('sqlstr=%s', sqlstr) 713 LOGGER.debug('sqlargs=%s', sqlargs) 714 result = self.database.execute(sqlstr, sqlargs).fetchall() 715 if DEBUG_LEVEL > 1: 716 LOGGER.debug('result=%s', result) 717 return result 718 719 def check_phrase_and_update_frequency( 720 self, 721 input_phrase: str = '', 722 phrase: str = '', 723 p_phrase: str = '', 724 pp_phrase: str = '', 725 user_freq_increment: int = 1, 726 commit: bool = True) -> None: 727 ''' 728 Check whether input_phrase and phrase are already in database. If 729 they are in the database, increase the frequency by 1, if not 730 add them. 
731 ''' 732 if not input_phrase: 733 input_phrase = phrase 734 if not phrase: 735 return 736 phrase = unicodedata.normalize( 737 itb_util.NORMALIZATION_FORM_INTERNAL, phrase) 738 p_phrase = itb_util.remove_accents(p_phrase.lower()) 739 pp_phrase = itb_util.remove_accents(pp_phrase.lower()) 740 input_phrase = itb_util.remove_accents(input_phrase.lower()) 741 742 if DEBUG_LEVEL > 1: 743 LOGGER.debug( 744 'phrase=%(p)s, input_phrase=%(t)s', 745 {'p': phrase, 't': input_phrase}) 746 747 # There should never be more than 1 database row for the same 748 # input_phrase *and* phrase. So the following query on 749 # the database should match at most one database 750 # row and the length of the result array should be 0 or 751 # 1. So the “GROUP BY phrase” is actually redundant. It is 752 # only a safeguard for the case when duplicate rows have been 753 # added to the database accidentally (But in that case there 754 # is a bug somewhere else which should be fixed). 755 sqlstr = ''' 756 SELECT max(user_freq) FROM user_db.phrases 757 WHERE input_phrase = :input_phrase 758 AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase 759 GROUP BY phrase 760 ;''' 761 sqlargs = {'input_phrase': input_phrase, 762 'phrase': phrase, 763 'p_phrase': p_phrase, 764 'pp_phrase': pp_phrase} 765 if DEBUG_LEVEL > 1: 766 LOGGER.debug( 767 'TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s', 768 sqlstr) 769 LOGGER.debug( 770 'TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s', 771 sqlargs) 772 result = self.database.execute(sqlstr, sqlargs).fetchall() 773 if DEBUG_LEVEL > 1: 774 LOGGER.debug( 775 'check_phrase_and_update_frequency() result=%s', result) 776 if result: 777 # A match was found in user_db, increase user frequency by 778 # user_freq_increment (1 by default) 779 self.update_phrase(input_phrase=input_phrase, 780 phrase=phrase, 781 p_phrase=p_phrase, 782 pp_phrase=pp_phrase, 783 user_freq=result[0][0]+user_freq_increment, 784 commit=commit) 785 return 786 
# The phrase was not found in user_db. 787 # Add it as a new phrase, i.e. with user_freq = user_freq_increment 788 # (1 by default): 789 self.add_phrase(input_phrase=input_phrase, 790 phrase=phrase, 791 p_phrase=p_phrase, 792 pp_phrase=pp_phrase, 793 user_freq=user_freq_increment, 794 commit=commit) 795 return 796 797 def remove_phrase( 798 self, 799 input_phrase: str = '', 800 phrase: str = '', 801 commit: bool = True) -> None: 802 ''' 803 Remove all rows matching “input_phrase” and “phrase” from database. 804 Or, if “input_phrase” is “None”, remove all rows matching “phrase” 805 no matter for what input phrase from the database. 806 ''' 807 if DEBUG_LEVEL > 1: 808 LOGGER.debug( 809 'TabSqliteDb.remove_phrase() phrase=%(p)s', 810 {'p': phrase.encode('UTF-8')}) 811 if not phrase: 812 return 813 phrase = unicodedata.normalize( 814 itb_util.NORMALIZATION_FORM_INTERNAL, phrase) 815 if input_phrase: 816 input_phrase = unicodedata.normalize( 817 itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase) 818 if input_phrase: 819 delete_sqlstr = ''' 820 DELETE FROM user_db.phrases 821 WHERE input_phrase = :input_phrase AND phrase = :phrase 822 ;''' 823 else: 824 delete_sqlstr = ''' 825 DELETE FROM user_db.phrases 826 WHERE phrase = :phrase 827 ;''' 828 delete_sqlargs = {'input_phrase': input_phrase, 'phrase': phrase} 829 self.database.execute(delete_sqlstr, delete_sqlargs) 830 if commit: 831 self.database.commit() 832 833 def extract_user_phrases(self) -> List[Tuple[str, int]]: 834 '''extract user phrases from database''' 835 try: 836 database = sqlite3.connect(self.user_db_file) 837 database.execute('PRAGMA wal_checkpoint;') 838 phrases = database.execute( 839 'SELECT phrase, sum(user_freq) FROM phrases GROUP BY phrase;' 840 ).fetchall() 841 database.close() 842 phrases = [ 843 (unicodedata.normalize( 844 itb_util.NORMALIZATION_FORM_INTERNAL, x[0]), x[1]) 845 for x in 846 phrases 847 ] 848 return phrases[:] 849 except Exception: 850 LOGGER.exception('Unexpected error extracting 
user phrases.') 851 return [] 852 853 def read_training_data_from_file(self, filename: str) -> bool: 854 ''' 855 Read data to train the prediction from a text file. 856 857 :param filename: Full path of the text file to read. 858 ''' 859 if not os.path.isfile(filename): 860 filename += '.gz' 861 if not os.path.isfile(filename): 862 return False 863 open_function: Callable = open 864 if filename.endswith('.gz'): 865 open_function = gzip.open 866 rows = self.database.execute( 867 'SELECT input_phrase, phrase, p_phrase, pp_phrase, ' 868 + 'user_freq, timestamp FROM phrases;').fetchall() 869 rows = sorted(rows, key = lambda x: (x[5])) # sort by timestamp 870 time_min = time.time() 871 time_max = time.time() 872 if rows: 873 time_min = rows[0][5] 874 time_max = rows[-1][5] 875 # timestamp for added entries (timestamp of existing entries is kept): 876 time_new = time_min + 0.20 * (time_max - time_min) 877 LOGGER.info('Minimum timestamp in the database=%s', 878 time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_min))) 879 LOGGER.info('Maximum timestamp in the database=%s', 880 time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_max))) 881 LOGGER.info('New timestamp in the database=%s', 882 time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time_new))) 883 p_token = '' 884 pp_token = '' 885 database_dict: Dict[Tuple[str, str, str, str], Dict[str, Any]] = {} 886 for row in rows: 887 # itb_util.remove_accents() returns in NORMALIZATION_FORM_INTERNAL. 888 # row[1] (“phrase”) should already be in NORMALIZATION_FORM_INTERNAL 889 # but better convert it again here just to make sure. 
890 input_phrase = itb_util.remove_accents(row[0].lower()) 891 phrase = unicodedata.normalize( 892 itb_util.NORMALIZATION_FORM_INTERNAL, row[1]) 893 p_phrase = itb_util.remove_accents(row[2].lower()) 894 pp_phrase = itb_util.remove_accents(row[3].lower()) 895 database_dict.update([((row[0], row[1], row[2], row[3]), 896 {'input_phrase': input_phrase, 897 'phrase': phrase, 898 'p_phrase': p_phrase, 899 'pp_phrase': pp_phrase, 900 'user_freq': row[4], 901 'timestamp': row[5]} 902 )]) 903 lines = [] 904 try: 905 with open_function( 906 filename, mode='rt', encoding='UTF-8') as file_handle: 907 lines = [ 908 unicodedata.normalize( 909 itb_util.NORMALIZATION_FORM_INTERNAL, line) 910 for line in file_handle.readlines()] 911 except Exception: 912 LOGGER.exception( 913 'Unexpected error reading training data from file.') 914 return False 915 for line in lines: 916 for token in itb_util.tokenize(line): 917 key = (itb_util.remove_accents(token.lower()), 918 token, 919 itb_util.remove_accents(p_token.lower()), 920 itb_util.remove_accents(pp_token.lower())) 921 if key in database_dict: 922 database_dict[key]['user_freq'] += 1 923 else: 924 database_dict[key] = { 925 'input_phrase': itb_util.remove_accents(token.lower()), 926 'phrase': token, 927 'p_phrase': itb_util.remove_accents(p_token.lower()), 928 'pp_phrase': itb_util.remove_accents(pp_token.lower()), 929 'user_freq': 1, 930 'timestamp': time_new} 931 pp_token = p_token 932 p_token = token 933 sqlargs = [] 934 for key in database_dict: 935 sqlargs.append(database_dict[key]) 936 sqlstr = ''' 937 INSERT INTO user_db.phrases (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp) 938 VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp) 939 ;''' 940 try: 941 self.database.execute('DELETE FROM phrases;') 942 # Without the following commit, the 943 # self.database.executemany() fails with 944 # “OperationalError: database is locked”. 
945 self.database.commit() 946 self.database.executemany(sqlstr, sqlargs) 947 self.database.commit() 948 self.database.execute('PRAGMA wal_checkpoint;') 949 except Exception: 950 LOGGER.exception( 951 'Unexpected error writing training data to database.') 952 return False 953 return True 954 955 def remove_all_phrases(self) -> None: 956 ''' 957 Remove all phrases from the database, i.e. delete all the 958 data learned from user input or text files. 959 ''' 960 try: 961 self.database.execute('DELETE FROM phrases;') 962 self.database.commit() 963 self.database.execute('PRAGMA wal_checkpoint;') 964 except Exception: 965 LOGGER.exception( 966 'Unexpected error removing all phrases from database.') 967 968 def dump_database(self) -> None: 969 ''' 970 Dump the contents of the database to the log 971 972 (For debugging) 973 ''' 974 try: 975 LOGGER.debug('SELECT * FROM desc;\n') 976 for row in self.database.execute("SELECT * FROM desc;").fetchall(): 977 LOGGER.debug('%s', repr(row)) 978 LOGGER.debug('SELECT * FROM phrases;\n') 979 for row in self.database.execute( 980 "SELECT * FROM phrases;").fetchall(): 981 LOGGER.debug('%s', repr(row)) 982 except Exception: 983 LOGGER.exception('Unexpected error dumping database.') 984 return 985 986 def number_of_rows_in_database(self) -> int: 987 ''' 988 Return the current number of rows in the database 989 990 (For debugging) 991 ''' 992 try: 993 return len(self.database.execute( 994 "SELECT * FROM phrases;").fetchall()) 995 except Exception: 996 LOGGER.exception('Unexpected error getting number of database rows.') 997 return -1 998 999 def cleanup_database(self, thread: bool = True) -> None: 1000 '''Cleanup user database by expiring entries which have not been 1001 used for a long time. 1002 1003 :param thread: Whether this function is called in a different thread or not. 1004 1005 Usually it is called in a thread started when Typing Booster 1006 has just finished starting up and is ready for input. 
1007 1008 But in unittest test cases it may be called with an in memory 1009 database without starting a thread. In that case a new 1010 database connection is not needed. 1011 1012 ''' 1013 if thread and self.user_db_file == ':memory:': 1014 LOGGER.info('Database cleanup not needed for memory database.') 1015 return 1016 LOGGER.info('Database cleanup starting ...') 1017 time_now = time.time() 1018 # id, input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp 1019 rows: List[Tuple[int, str, str, str, str, int, float]] = [] 1020 try: 1021 if thread: 1022 # SQLite objects created in a thread can only be used in 1023 # that same thread. As the database cleanup is usually 1024 # called in a separate thread, get a new connection: 1025 database = sqlite3.connect(self.user_db_file) 1026 else: 1027 database = self.database 1028 rows = database.execute("SELECT * FROM phrases;").fetchall() 1029 except Exception: 1030 LOGGER.exception('Exception when accessing database') 1031 import traceback 1032 traceback.print_exc() 1033 if not rows: 1034 return 1035 rows = sorted(rows, 1036 key = lambda x: ( 1037 x[5], # user_freq 1038 x[6], # timestamp 1039 x[0], # id 1040 )) 1041 LOGGER.info('Total number of database rows to check=%s ', len(rows)) 1042 index = len(rows) 1043 max_rows = 50000 1044 number_delete_above_max = 0 1045 rows_kept: List[Tuple[int, str, str, str, str, int, float]] = [] 1046 for row in rows: 1047 user_freq = row[5] 1048 if (index > max_rows 1049 and user_freq < itb_util.SHORTCUT_USER_FREQ): 1050 LOGGER.info('1st pass: deleting %s %s', 1051 repr(row), 1052 time.strftime("%Y-%m-%d %H:%M:%S", 1053 time.gmtime(row[6]))) 1054 number_delete_above_max += 1 1055 sqlstr_delete = 'DELETE from phrases WHERE id = :id;' 1056 sqlargs_delete = {'id': row[0]} 1057 try: 1058 database.execute(sqlstr_delete, sqlargs_delete) 1059 except Exception: 1060 LOGGER.exception( 1061 '1st pass: exception deleting row from database.') 1062 import traceback 1063 traceback.print_exc() 
1064 else: 1065 rows_kept.append(row) 1066 index -= 1 1067 LOGGER.info('1st pass: Number of rows deleted above maximum size=' 1068 f'{number_delete_above_max}') 1069 # As the first pass above removes rows sorted by count and 1070 # then by timestamp, it will never remove rows with a higher 1071 # count even if they are extremely old. Therefore, a second 1072 # pass uses sorting only by timestamp in order to first decay and 1073 # eventually remove some rows which have not been used for a 1074 # long time as well, even if they have a higher count. 1075 # In this second pass, the 0.1% oldest rows are checked 1076 # and: 1077 # 1078 # - if user_freq == 1 remove the row 1079 # - if user_freq > 1 divide user_freq by 2 and update timestamp to “now” 1080 # 1081 # 0.1% is really not much but I want to be careful not to remove 1082 # too much when trying this out. 1083 # 1084 # sort kept rows by timestamp only instead of user_freq and timestamp: 1085 rows_kept = sorted(rows_kept, 1086 key = lambda x: ( 1087 x[6], # timestamp 1088 x[0], # id 1089 )) 1090 index = len(rows_kept) 1091 LOGGER.info('1st pass: Number of rows kept=%s', index) 1092 index_decay = int(max_rows * 0.999) 1093 LOGGER.info('2nd pass: Index for decay=%s', index_decay) 1094 number_of_rows_to_decay = 0 1095 number_of_rows_to_delete = 0 1096 for row in rows_kept: 1097 user_freq = row[5] 1098 if (index > index_decay 1099 and user_freq < itb_util.SHORTCUT_USER_FREQ): 1100 if user_freq == 1: 1101 LOGGER.info('2nd pass: deleting %s %s', 1102 repr(row), 1103 time.strftime("%Y-%m-%d %H:%M:%S", 1104 time.gmtime(row[6]))) 1105 number_of_rows_to_delete += 1 1106 sqlstr_delete = 'DELETE from phrases WHERE id = :id;' 1107 sqlargs_delete = {'id': row[0]} 1108 try: 1109 database.execute(sqlstr_delete, sqlargs_delete) 1110 except Exception: 1111 LOGGER.exception( 1112 '2nd pass: exception deleting row from database.') 1113 import traceback 1114 traceback.print_exc() 1115 else: 1116 LOGGER.info('2nd pass: decaying %s %s', 
1117 repr(row), 1118 time.strftime("%Y-%m-%d %H:%M:%S", 1119 time.gmtime(row[6]))) 1120 number_of_rows_to_decay += 1 1121 sqlstr_update = ''' 1122 UPDATE phrases 1123 SET user_freq = :user_freq, timestamp = :timestamp 1124 WHERE id = :id 1125 ;''' 1126 sqlargs_update = {'id': row[0], 1127 'user_freq': int(user_freq/2), 1128 'timestamp': time.time()} 1129 1130 try: 1131 database.execute(sqlstr_update, sqlargs_update) 1132 except Exception: 1133 LOGGER.exception( 1134 '2nd pass: exception decaying row from database.') 1135 import traceback 1136 traceback.print_exc() 1137 index -= 1 1138 LOGGER.info('Commit database and execute checkpoint ...') 1139 database.commit() 1140 database.execute('PRAGMA wal_checkpoint;') 1141 LOGGER.info('Rebuild database using VACUUM command ...') 1142 database.execute('VACUUM;') 1143 LOGGER.info('Number of database rows deleted=%s', 1144 number_delete_above_max + number_of_rows_to_delete) 1145 LOGGER.info('Number of database rows decayed=%s', 1146 number_of_rows_to_decay) 1147 LOGGER.info('Number of rows before cleanup=%s', len(rows)) 1148 LOGGER.info('Number of rows remaining=%s', 1149 len(rows_kept) - number_of_rows_to_delete) 1150 LOGGER.info('Time for database cleanup=%s seconds', 1151 time.time() - time_now) 1152 LOGGER.info('Database cleanup finished.') 1153