# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""

import os
import sqlite3

from nltk.corpus.reader.api import CorpusReader


class PanLexLiteCorpusReader(CorpusReader):
    """
    Corpus reader that answers queries against the PanLex Lite SQLite
    database (``db.sqlite``) found in the corpus root directory.
    """

    # Given an expression (text + language variety id), select every *other*
    # expression that shares a meaning with it, best source quality first.
    # Result columns: mn, uq, ap, ui, tt, lv.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Given a source expression and a target language variety, select the
    # target expressions sharing a meaning with it. The inner query keeps the
    # highest ``uq`` per (target text, ``ui``) pair; the outer query sums
    # those values per target text and sorts by that sum, then by text.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        """
        Open the PanLex Lite database and cache the language variety tables.

        :param root: path of the directory that contains ``db.sqlite``.
        """
        # NOTE: CorpusReader.__init__ is intentionally not called here; every
        # method of this reader works through the SQLite cursor below.
        self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()

        # Two-way mapping between uniform identifiers (uid) and the numeric
        # language variety ids (lv) used inside the database.
        self._uid_lv = {}
        self._lv_uid = {}

        for uid, lv in self._c.execute('SELECT uid, lv FROM lv'):
            self._uid_lv[uid] = lv
            self._lv_uid[lv] = uid

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """

        if lc is None:
            return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
        else:
            return self._c.execute(
                'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        :raises KeyError: if ``expr_uid`` is not a known language variety.
        """

        expr_lv = self._uid_lv[expr_uid]

        # Meaning id -> attribute dict, in the order the meanings are first
        # returned by the query.
        mn_info = {}

        for mn, uq, ap, ui, tt, lv in self._c.execute(
            self.MEANING_Q, (expr_tt, expr_lv)
        ):
            uid = self._lv_uid[lv]

            if mn not in mn_info:
                # The first row seen for a meaning supplies its source
                # attributes; the queried expression itself is seeded into
                # the expression map because the query excludes it.
                mn_info[mn] = {
                    'uq': uq,
                    'ap': ap,
                    'ui': ui,
                    'ex': {expr_uid: [expr_tt]},
                }

            mn_info[mn]['ex'].setdefault(uid, []).append(tt)

        return [Meaning(mn, info) for mn, info in mn_info.items()]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        :raises KeyError: if ``from_uid`` or ``to_uid`` is not a known
            language variety.
        """

        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()


class Meaning(dict):
    """
    Represents a single PanLex meaning. A meaning is a translation set derived
    from a single source.
    """

    def __init__(self, mn, attr):
        """
        :param mn: the meaning's id, stored under the ``'mn'`` key.
        :param attr: a dict of the meaning's remaining attributes; its items
            are copied into this dict (keys must be strings).
        """
        super(Meaning, self).__init__(**attr)
        self['mn'] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self['mn']

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self['uq']

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self['ap']

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self['ui']

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        return self['ex']