1# Natural Language Toolkit: PanLex Corpus Reader
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: David Kamholz <kamholz@panlex.org>
5# URL: <http://nltk.org/>
6# For license information, see LICENSE.TXT
7
8"""
9CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
10as an SQLite database. See the README.txt in the panlex_lite corpus directory
11for more information on PanLex Lite.
12"""
13
14import os
15import sqlite3
16
17from nltk.corpus.reader.api import CorpusReader
18
19
20class PanLexLiteCorpusReader(CorpusReader):
21    MEANING_Q = """
22        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
23        FROM dnx
24        JOIN ex ON (ex.ex = dnx.ex)
25        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
26        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
27        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
28        ORDER BY dnx2.uq DESC
29    """
30
31    TRANSLATION_Q = """
32        SELECT s.tt, sum(s.uq) AS trq FROM (
33            SELECT ex2.tt, max(dnx.uq) AS uq
34            FROM dnx
35            JOIN ex ON (ex.ex = dnx.ex)
36            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
37            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
38            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
39            GROUP BY ex2.tt, dnx.ui
40        ) s
41        GROUP BY s.tt
42        ORDER BY trq DESC, s.tt
43    """
44
45    def __init__(self, root):
46        self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()
47
48        self._uid_lv = {}
49        self._lv_uid = {}
50
51        for row in self._c.execute('SELECT uid, lv FROM lv'):
52            self._uid_lv[row[0]] = row[1]
53            self._lv_uid[row[1]] = row[0]
54
55    def language_varieties(self, lc=None):
56        """
57        Return a list of PanLex language varieties.
58
59        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
60            by this code. If unspecified, all varieties are returned.
61        :return: the specified language varieties as a list of tuples. The first
62            element is the language variety's seven-character uniform identifier,
63            and the second element is its default name.
64        :rtype: list(tuple)
65        """
66
67        if lc is None:
68            return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
69        else:
70            return self._c.execute(
71                'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
72            ).fetchall()
73
74    def meanings(self, expr_uid, expr_tt):
75        """
76        Return a list of meanings for an expression.
77
78        :param expr_uid: the expression's language variety, as a seven-character
79            uniform identifier.
80        :param expr_tt: the expression's text.
81        :return: a list of Meaning objects.
82        :rtype: list(Meaning)
83        """
84
85        expr_lv = self._uid_lv[expr_uid]
86
87        mn_info = {}
88
89        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
90            mn = i[0]
91            uid = self._lv_uid[i[5]]
92
93            if not mn in mn_info:
94                mn_info[mn] = {
95                    'uq': i[1],
96                    'ap': i[2],
97                    'ui': i[3],
98                    'ex': {expr_uid: [expr_tt]},
99                }
100
101            if not uid in mn_info[mn]['ex']:
102                mn_info[mn]['ex'][uid] = []
103
104            mn_info[mn]['ex'][uid].append(i[4])
105
106        return [Meaning(mn, mn_info[mn]) for mn in mn_info]
107
108    def translations(self, from_uid, from_tt, to_uid):
109        """
110        Return a list of translations for an expression into a single language
111            variety.
112
113        :param from_uid: the source expression's language variety, as a
114            seven-character uniform identifier.
115        :param from_tt: the source expression's text.
116        :param to_uid: the target language variety, as a seven-character
117            uniform identifier.
118        :return a list of translation tuples. The first element is the expression
119            text and the second element is the translation quality.
120        :rtype: list(tuple)
121        """
122
123        from_lv = self._uid_lv[from_uid]
124        to_lv = self._uid_lv[to_uid]
125
126        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
127
128
129class Meaning(dict):
130    """
131    Represents a single PanLex meaning. A meaning is a translation set derived
132    from a single source.
133    """
134
135    def __init__(self, mn, attr):
136        super(Meaning, self).__init__(**attr)
137        self['mn'] = mn
138
139    def id(self):
140        """
141        :return: the meaning's id.
142        :rtype: int
143        """
144        return self['mn']
145
146    def quality(self):
147        """
148        :return: the meaning's source's quality (0=worst, 9=best).
149        :rtype: int
150        """
151        return self['uq']
152
153    def source(self):
154        """
155        :return: the meaning's source id.
156        :rtype: int
157        """
158        return self['ap']
159
160    def source_group(self):
161        """
162        :return: the meaning's source group id.
163        :rtype: int
164        """
165        return self['ui']
166
167    def expressions(self):
168        """
169        :return: the meaning's expressions as a dictionary whose keys are language
170            variety uniform identifiers and whose values are lists of expression
171            texts.
172        :rtype: dict
173        """
174        return self['ex']
175