1# encoding: utf-8
2# Natural Language Toolkit: Senna Interface
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9"""
10A general interface to the SENNA pipeline that supports any of the
11operations specified in SUPPORTED_OPERATIONS.
12
Applying multiple operations at once has a speed advantage. For example,
14Senna will automatically determine POS tags if you are extracting named
15entities. Applying both of the operations will cost only the time of
16extracting the named entities.
17
18The SENNA pipeline has a fixed maximum size of the sentences that it can read.
By default it is 1024 tokens per sentence. If you have larger sentences, changing
20the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
21system specific binary should be rebuilt. Otherwise this could introduce
22misalignment errors.
23
24The input is:
- path to the directory that contains SENNA executables. If the executable is not found there,
   Senna will automatically look for it in the directory given by the SENNA environment variable
27- List of the operations needed to be performed.
28- (optionally) the encoding of the input data (default:utf-8)
29
30Note: Unit tests for this module can be found in test/unit/test_senna.py
31
32    >>> from __future__ import unicode_literals
33    >>> from nltk.classify import Senna
34    >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
35    >>> sent = 'Dusseldorf is an international business center'.split()
36    >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
37    [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
38    ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
39"""
40
41
42from __future__ import unicode_literals
43from os import path, sep, environ
44from subprocess import Popen, PIPE
45from platform import architecture, system
46
47from six import text_type
48
49from nltk.tag.api import TaggerI
50from nltk.compat import python_2_unicode_compatible
51
52_senna_url = 'http://ml.nec-labs.com/senna/'
53
54
@python_2_unicode_compatible
class Senna(TaggerI):
    """
    Wrapper around the SENNA command-line tagger.

    An instance remembers the directory holding the SENNA binaries, the list
    of operations to request and the encoding used to talk to the process.
    Each call to ``tag_sents`` spawns the binary once, feeds it all sentences
    on stdin and parses the tab-separated output.
    """

    # Operations whose output columns this class knows how to read. The order
    # of this list is the order in which ``_map`` assigns output columns.
    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']

    def __init__(self, senna_path, operations, encoding='utf-8'):
        """
        :param senna_path: directory expected to contain the SENNA executables.
        :param operations: list of operation names (see SUPPORTED_OPERATIONS).
        :param encoding: encoding used for the data sent to and read from the
            SENNA process.
        :raises OSError: if the executable is missing from ``senna_path`` and
            the SENNA environment variable is set but the executable is not
            found in that directory either. When the variable is not set no
            check fails here; ``tag_sents`` verifies the executable again.
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep

        # Look under the supplied path first, then fall back to the directory
        # named by the SENNA environment variable (if it is defined).
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1) and 'SENNA' in environ:
            self._path = path.normpath(environ['SENNA']) + sep
            exe_file_2 = self.executable(self._path)
            if not path.isfile(exe_file_2):
                raise OSError(
                    "Senna executable expected at %s or %s but not found"
                    % (exe_file_1, exe_file_2)
                )

        self.operations = operations

    def executable(self, base_path):
        """
        Return the path of the system specific binary under ``base_path``.

        The name is chosen from ``platform.system()`` (and, on Linux, from
        ``platform.architecture()``). For any other system the default
        ``senna`` binary name is used.
        """
        os_name = system()
        if os_name == 'Linux':
            bits = architecture()[0]
            if bits == '64bit':
                return path.join(base_path, 'senna-linux64')
            return path.join(base_path, 'senna-linux32')
        if os_name == 'Windows':
            return path.join(base_path, 'senna-win32.exe')
        if os_name == 'Darwin':
            return path.join(base_path, 'senna-osx')
        return path.join(base_path, 'senna')

    def _map(self):
        """
        Return a dict mapping each requested operation to the index of the
        output column it is read from.

        Column 0 is skipped; the requested operations take columns 1, 2, ...
        in the order given by SUPPORTED_OPERATIONS, regardless of the order
        in ``self.operations``.
        """
        requested = [
            operation
            for operation in Senna.SUPPORTED_OPERATIONS
            if operation in self.operations
        ]
        return {operation: column for column, operation in enumerate(requested, 1)}

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.

        :param tokens: a single sentence given as a list of token strings.
        :return: the first (only) sentence of ``tag_sents([tokens])``.
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Applies the requested operations to a list of sentences.

        :param sentences: a sequence of sentences, each a sequence of token
            strings.
        :return: a list with one entry per sentence; each entry is a list of
            dictionaries, one per token, holding the original token under
            ``'word'`` and one key per requested supported operation.
        :raises OSError: if the SENNA executable is not found.
        :raises RuntimeError: if the SENNA process exits with a non-zero code.
        :raises IndexError: if the output contains more tokens or sentences
            than the input (misalignment).
        """
        encoding = self._encoding

        # Resolve the binary once and reuse it for the check and the command.
        senna_executable = self.executable(self._path)
        if not path.isfile(senna_executable):
            raise OSError(
                "Senna executable expected at %s but not found" % senna_executable
            )

        # Without this guard an empty input would be parsed from an empty
        # output as one spurious blank line and yield [[], []].
        if not sentences:
            return []

        # Build the senna command to run the tagger
        _senna_cmd = [
            senna_executable,
            '-path',
            self._path,
            '-usrtokens',
            '-iobtags',
        ]
        _senna_cmd.extend(['-' + op for op in self.operations])

        # Serialize the sentences: one sentence per line, tokens separated
        # by single spaces.
        _input = '\n'.join(' '.join(x) for x in sentences) + '\n'
        if isinstance(_input, text_type) and encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)

        # Check the return code.
        if p.returncode != 0:
            # Decode so the message shows the text rather than a bytes repr;
            # 'replace' keeps the error reporting itself from failing.
            if encoding and isinstance(stderr, bytes):
                stderr = stderr.decode(encoding, 'replace')
            raise RuntimeError('Senna command failed! Details: %s' % stderr)

        senna_output = stdout.decode(encoding) if encoding else stdout

        # Walk the output line by line; a blank line closes a sentence.
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            # strip() so that a separator line consisting only of "\r"
            # (output with "\r\n" line endings) still counts as blank.
            if not tagged_word.strip():
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split('\t')
            result = {}
            for tag in map_:
                result[tag] = tags[map_[tag]].strip()
            try:
                # The word is taken from the input, not from the output.
                result['word'] = sentences[sentence_index][token_index]
            except IndexError:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                )
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences
186
187
188# skip doctests if Senna is not installed
def setup_module(module):
    """
    Module-level test setup hook: raise ``SkipTest`` when constructing a
    ``Senna`` instance for the default install directory fails with OSError.
    """
    from nose import SkipTest

    default_dir = '/usr/share/senna-v3.0'
    wanted_operations = ['pos', 'chk', 'ner']
    try:
        # Only the constructor's executable check matters here; the instance
        # itself is not needed.
        Senna(default_dir, wanted_operations)
    except OSError:
        raise SkipTest("Senna executable not found")
196