# encoding: utf-8
# Natural Language Toolkit: Senna Interface
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A general interface to the SENNA pipeline that supports any of the
operations specified in SUPPORTED_OPERATIONS.

Applying multiple operations at once has a speed advantage. For example,
Senna will automatically determine POS tags if you are extracting named
entities. Applying both of the operations will cost only the time of
extracting the named entities.

The SENNA pipeline has a fixed maximum size of the sentences that it can read.
By default it is 1024 tokens per sentence. If you have larger sentences, changing
the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
system specific binary should be rebuilt. Otherwise this could introduce
misalignment errors.

The input is:
- path to the directory that contains SENNA executables. If the path is incorrect,
   Senna will automatically search for executable file specified in SENNA environment variable
- List of the operations needed to be performed.
- (optionally) the encoding of the input data (default:utf-8)

Note: Unit tests for this module can be found in test/unit/test_senna.py

    >>> from __future__ import unicode_literals
    >>> from nltk.classify import Senna
    >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
    >>> sent = 'Dusseldorf is an international business center'.split()
    >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
    [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
    ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""


from __future__ import unicode_literals
from os import path, sep, environ
from subprocess import Popen, PIPE
from platform import architecture, system

from six import text_type

from nltk.tag.api import TaggerI
from nltk.compat import python_2_unicode_compatible

_senna_url = 'http://ml.nec-labs.com/senna/'


@python_2_unicode_compatible
class Senna(TaggerI):
    """
    Tagger that pipes tokenized sentences through an external SENNA binary
    and returns, for every token, a dictionary holding the word and one
    entry per requested operation.
    """

    # Operations this interface knows how to read back, listed in the order
    # used by ``_map`` to assign output column indices.
    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']

    def __init__(self, senna_path, operations, encoding='utf-8'):
        """
        :param senna_path: directory expected to contain the SENNA binaries.
        :param operations: operations to run; each one is passed to the
            binary as a ``-<operation>`` flag.
        :param encoding: encoding used to encode the input sent to SENNA and
            to decode its output.
        :raises OSError: if the binary is missing from ``senna_path`` and the
            ``SENNA`` environment variable is set but does not point to a
            directory containing it either.

        Note that when the binary is missing and ``SENNA`` is not set, no
        error is raised here; ``tag_sents`` performs its own check and raises
        ``OSError`` at tagging time.
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep

        # Look for the executable under the given path first and fall back to
        # the directory named by the SENNA environment variable.
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1) and 'SENNA' in environ:
            self._path = path.normpath(environ['SENNA']) + sep
            exe_file_2 = self.executable(self._path)
            if not path.isfile(exe_file_2):
                raise OSError(
                    "Senna executable expected at %s or %s but not found"
                    % (exe_file_1, exe_file_2)
                )

        self.operations = operations

    def executable(self, base_path):
        """
        The function that determines the system specific binary that should be
        used in the pipeline. In case, the system is not known the default senna binary will
        be used.

        :param base_path: directory the binary name is joined onto.
        :return: path of the binary for the current platform.
        """
        os_name = system()
        if os_name == 'Linux':
            bits = architecture()[0]
            if bits == '64bit':
                return path.join(base_path, 'senna-linux64')
            return path.join(base_path, 'senna-linux32')
        if os_name == 'Windows':
            return path.join(base_path, 'senna-win32.exe')
        if os_name == 'Darwin':
            return path.join(base_path, 'senna-osx')
        return path.join(base_path, 'senna')

    def _map(self):
        """
        A method that calculates the order of the columns that SENNA pipeline
        will output the tags into. This depends on the operations being ordered.

        :return: dictionary mapping each requested supported operation to its
            1-based column index (column 0 is not assigned to any operation).
        """
        requested = [
            operation
            for operation in Senna.SUPPORTED_OPERATIONS
            if operation in self.operations
        ]
        return dict(
            (operation, column) for column, operation in enumerate(requested, 1)
        )

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.

        :param tokens: the tokens of a single sentence.
        :return: list with one dictionary per token.
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will return a
        list of dictionaries. Every dictionary will contain a word with its
        calculated annotations/tags.

        :param sentences: list of sentences, each a list of token strings.
        :return: list with one entry per sentence, each a list of
            dictionaries with a ``'word'`` key plus one key per operation
            returned by ``_map``. An empty ``sentences`` yields ``[]``.
        :raises OSError: if the SENNA executable is not found.
        :raises RuntimeError: if the SENNA process exits with a non-zero code.
        :raises IndexError: if the output cannot be aligned with the input
            tokens.
        """
        encoding = self._encoding
        senna_binary = self.executable(self._path)

        if not path.isfile(senna_binary):
            raise OSError(
                "Senna executable expected at %s but not found" % senna_binary
            )

        # Nothing to tag: avoid spawning the process and avoid reporting
        # phantom empty sentences for an empty input.
        if not sentences:
            return []

        # Build the senna command to run the tagger
        _senna_cmd = [
            senna_binary,
            '-path',
            self._path,
            '-usrtokens',
            '-iobtags',
        ]
        _senna_cmd.extend(['-' + op for op in self.operations])

        # Serialize the actual sentences to a temporary string:
        # one sentence per line, tokens separated by single spaces.
        _input = '\n'.join(' '.join(x) for x in sentences) + '\n'
        if isinstance(_input, text_type) and encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout

        # Check the return code.
        if p.returncode != 0:
            if encoding:
                # Decode leniently so that reporting the failure cannot
                # itself fail on undecodable bytes.
                stderr = stderr.decode(encoding, 'replace')
            raise RuntimeError('Senna command failed! Details: %s' % stderr)

        if encoding:
            senna_output = stdout.decode(encoding)

        # Output the tagged sentences
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            # A blank line separates sentences. Strip before testing so that
            # a line holding only a carriage return (CRLF output) is also
            # recognised as a separator rather than parsed as a token line.
            if not tagged_word.strip():
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split('\t')
            result = {}
            for tag, column in map_.items():
                result[tag] = tags[column].strip()
            try:
                # The word is taken from the input, not from SENNA's output.
                result['word'] = sentences[sentence_index][token_index]
            except IndexError:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                )
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences


# skip doctests if Senna is not installed
def setup_module(module):
    """
    Raise ``SkipTest`` when constructing a ``Senna`` pipeline raises
    ``OSError``.
    """
    from nose import SkipTest

    try:
        Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
    except OSError:
        raise SkipTest("Senna executable not found")