# -*- coding: utf8 -*-
"""
High-level Python bindings built on top of the low-level C API (clinkgrammar)
See http://www.abisource.com/projects/link-grammar/api/index.html to get
more information about C API.
"""

import functools

try:
    #pylint: disable=no-name-in-module
    import linkgrammar.clinkgrammar as clg
except ImportError:
    #pylint: disable=import-error
    import clinkgrammar as clg

Clinkgrammar = clg
__all__ = ['ParseOptions', 'Dictionary', 'Link', 'Linkage', 'Sentence',
           'LG_Error', 'LG_DictionaryError', 'LG_TimerExhausted', 'Clinkgrammar']


# A decorator to ensure keyword-only arguments to __init__ (besides self).
# In Python3 it can be done by using "*" as the second __init__ argument,
# but here it is done with a decorator so it will work also in Python2.
def kwargs_only(init):
    """
    Wrap an __init__ method so that it accepts keyword arguments only.

    The returned wrapper raises TypeError if any positional argument
    (besides self) is supplied; otherwise it forwards the keyword arguments
    to the wrapped method and returns its result.
    """
    @functools.wraps(init)  # keep the name/docstring of the wrapped __init__
    def new_init(self_, *args, **kwargs):
        if args:
            raise TypeError("{}: Positional arguments are "
                            "not allowed".format(self_.__class__.__name__))
        return init(self_, **kwargs)
    return new_init


class ParseOptions(object):
    """
    Wrapper around a parse-options object created by
    clg.parse_options_create().

    Every option is exposed as a validated property that reads from and
    writes to the underlying object through the matching
    clg.parse_options_get_*/clg.parse_options_set_* call. Options can be
    given to the constructor as keyword arguments only. Assigning to an
    attribute name that is not a known option raises TypeError.
    """
    @kwargs_only
    def __init__(self, verbosity=0,
                 linkage_limit=100,
                 min_null_count=0,
                 max_null_count=0,
                 islands_ok=False,
                 short_length=16,
                 all_short_connectors=False,
                 display_morphology=True,
                 spell_guess=False,
                 use_sat=False,
                 max_parse_time=-1,
                 disjunct_cost=2.7,
                 repeatable_rand=True,
                 test='',
                 debug='',
                 dialect='',
                ):

        # _obj must be set first: every property setter below uses it.
        self._obj = clg.parse_options_create()
        self.verbosity = verbosity
        self.linkage_limit = linkage_limit
        self.min_null_count = min_null_count
        self.max_null_count = max_null_count
        self.islands_ok = islands_ok
        self.short_length = short_length
        self.all_short_connectors = all_short_connectors
        self.display_morphology = display_morphology
        self.spell_guess = spell_guess
        self.use_sat = use_sat
        self.max_parse_time = max_parse_time
        self.disjunct_cost = disjunct_cost
        self.repeatable_rand = repeatable_rand
        self.test = test
        self.debug = debug
        self.dialect = dialect

    # Allow only the attribute names listed below.
    def __setattr__(self, name, value):
        """Reject assignment to names that are neither an option nor _obj."""
        if not hasattr(self, name) and name != "_obj":
            # TypeError for consistency. It maybe should have been NameError.
            raise TypeError('Unknown parse option "{}".'.format(name))
        super(ParseOptions, self).__setattr__(name, value)

    def __del__(self):
        """Delete the underlying parse-options object, if still present."""
        if hasattr(self, '_obj'):
            clg.parse_options_delete(self._obj)
            del self._obj

    @property
    def test(self):
        """The "test" string, as reported by clg.parse_options_get_test()."""
        return clg.parse_options_get_test(self._obj)

    @test.setter
    def test(self, value):
        if not isinstance(value, str):
            raise TypeError("test must be set to a string")
        return clg.parse_options_set_test(self._obj, value)

    @property
    def debug(self):
        """The "debug" string, as reported by clg.parse_options_get_debug()."""
        return clg.parse_options_get_debug(self._obj)

    @debug.setter
    def debug(self, value):
        if not isinstance(value, str):
            raise TypeError("debug must be set to a string")
        return clg.parse_options_set_debug(self._obj, value)

    @property
    def dialect(self):
        """
        The "dialect" string, as reported by clg.parse_options_get_dialect().
        """
        return clg.parse_options_get_dialect(self._obj)

    # The setter must be derived from the "dialect" property itself;
    # deriving it from "debug" would rebind "dialect" to the debug getter.
    @dialect.setter
    def dialect(self, value):
        if not isinstance(value, str):
            raise TypeError("dialect must be set to a string")
        return clg.parse_options_set_dialect(self._obj, value)

    @property
    def verbosity(self):
        """
        This is the level of description printed to stderr/stdout about the
        parsing process.
        """
        return clg.parse_options_get_verbosity(self._obj)

    @verbosity.setter
    def verbosity(self, value):
        if not isinstance(value, int):
            raise TypeError("verbosity must be set to an integer")
        # Both bounds are inclusive, as stated in the error message.
        if not 0 <= value <= 120:
            raise ValueError("Verbosity levels can be any integer between 0 and 120 inclusive")
        clg.parse_options_set_verbosity(self._obj, value)

    @property
    def linkage_limit(self):
        """
        This parameter determines the maximum number of linkages that are
        considered in post-processing. If more than linkage_limit linkages
        found, then a random sample of linkage_limit is chosen for
        post-processing. When this happen a warning is displayed at verbosity
        levels bigger than 1.
        """
        return clg.parse_options_get_linkage_limit(self._obj)

    @linkage_limit.setter
    def linkage_limit(self, value):
        if not isinstance(value, int):
            raise TypeError("linkage_limit must be set to an integer")
        if value < 0:
            raise ValueError("linkage_limit must be positive")
        clg.parse_options_set_linkage_limit(self._obj, value)

    @property
    def disjunct_cost(self):
        """
        Determines the maximum disjunct cost used during parsing, where the
        cost of a disjunct is equal to the maximum cost of all of its connectors.
        The default is that only disjuncts of cost 2.7 or less are considered.
        """
        return clg.parse_options_get_disjunct_cost(self._obj)

    @disjunct_cost.setter
    def disjunct_cost(self, value):
        if not isinstance(value, float):
            raise TypeError("disjunct_cost must be set to a float")
        clg.parse_options_set_disjunct_cost(self._obj, value)

    @property
    def min_null_count(self):
        """
        The minimum number of null links that a parse might have. A call to
        sentence_parse will find all linkages having the minimum number of
        null links within the range specified by this parameter.
        """
        return clg.parse_options_get_min_null_count(self._obj)

    @min_null_count.setter
    def min_null_count(self, value):
        if not isinstance(value, int):
            raise TypeError("min_null_count must be set to an integer")
        if value < 0:
            raise ValueError("min_null_count must be positive")
        clg.parse_options_set_min_null_count(self._obj, value)

    @property
    def max_null_count(self):
        """
        The maximum number of null links that a parse might have. A call to
        sentence_parse will find all linkages having the minimum number of
        null links within the range specified by this parameter.
        """
        return clg.parse_options_get_max_null_count(self._obj)

    @max_null_count.setter
    def max_null_count(self, value):
        if not isinstance(value, int):
            raise TypeError("max_null_count must be set to an integer")
        if value < 0:
            raise ValueError("max_null_count must be positive")
        clg.parse_options_set_max_null_count(self._obj, value)

    @property
    def short_length(self):
        """
        The short_length parameter determines how long the links are allowed
        to be. The intended use of this is to speed up parsing by not
        considering very long links for most connectors, since they are very
        rarely used in a correct parse. An entry for UNLIMITED-CONNECTORS in
        the dictionary will specify which connectors are exempt from the
        length limit.
        """
        return clg.parse_options_get_short_length(self._obj)

    @short_length.setter
    def short_length(self, value):
        if not isinstance(value, int):
            raise TypeError("short_length must be set to an integer")
        if value < 0:
            raise ValueError("short_length must be positive")
        clg.parse_options_set_short_length(self._obj, value)

    @property
    def islands_ok(self):
        """
        This option determines whether or not "islands" of links are allowed.
        For example, the following linkage has an island:
            +------Wd-----+
            |     +--Dsu--+---Ss--+-Paf-+      +--Dsu--+---Ss--+--Pa-+
            |     |       |       |     |      |       |       |     |
          ///// this sentence.n is.v false.a this sentence.n is.v true.a
        """
        return clg.parse_options_get_islands_ok(self._obj) == 1

    @islands_ok.setter
    def islands_ok(self, value):
        if not isinstance(value, bool):
            raise TypeError("islands_ok must be set to a bool")
        clg.parse_options_set_islands_ok(self._obj, 1 if value else 0)

    @property
    def max_parse_time(self):
        """
        Determines the approximate maximum time (in seconds) that parsing is
        allowed to take. After this time has expired, the parsing process is
        artificially forced to complete quickly by pretending that no further
        solutions can be constructed. The actual parsing time might be
        slightly longer.
        """
        return clg.parse_options_get_max_parse_time(self._obj)

    @max_parse_time.setter
    def max_parse_time(self, value):
        if not isinstance(value, int):
            raise TypeError("max_parse_time must be set to an integer")
        clg.parse_options_set_max_parse_time(self._obj, value)

    @property
    def display_morphology(self):
        """
        Whether or not to show word morphology when a linkage diagram is printed.
        """
        return clg.parse_options_get_display_morphology(self._obj) == 1

    @display_morphology.setter
    def display_morphology(self, value):
        if not isinstance(value, bool):
            raise TypeError("display_morphology must be set to a bool")
        clg.parse_options_set_display_morphology(self._obj, 1 if value else 0)

    @property
    def spell_guess(self):
        """
        If greater then 0, the spelling guesser is used on unknown words.
        In that case, it performs at most this number of spell corrections
        per word, and performs run-on corrections which are not limited in
        their number. If 0 - the spelling guesser would not be used.
        """
        return clg.parse_options_get_spell_guess(self._obj)

    @spell_guess.setter
    def spell_guess(self, value):
        """
        If the value is an int, it is the maximum number of spell corrections
        per word. If it is True, an int value of 7 is assumed. A value of
        0 or False disables the spelling guesser.
        In case the spelling guesser is not disabled, run-on corrections will
        be issued too, not limited in their number.
        """
        if not isinstance(value, bool) and (not isinstance(value, int) or value < 0):
            raise TypeError("spell_guess must be set to bool or a non-negative integer")
        if isinstance(value, bool):
            value = 7 if value else 0
        clg.parse_options_set_spell_guess(self._obj, value)

    @property
    def use_sat(self):
        """
        To be used after enabling the use of the SAT solver in order to
        validate that it is supported by the LG library.
        """
        return clg.parse_options_get_use_sat_parser(self._obj)

    @use_sat.setter
    def use_sat(self, value):
        if not isinstance(value, bool):
            raise TypeError("use_sat must be set to a bool")
        clg.parse_options_set_use_sat_parser(self._obj, value)

    @property
    def all_short_connectors(self):
        """
        If true, then all connectors have length restrictions imposed on
        them -- they can be no farther than short_length apart. This is
        used when parsing in "panic" mode, for example.
        """
        return clg.parse_options_get_all_short_connectors(self._obj) == 1

    @all_short_connectors.setter
    def all_short_connectors(self, value):
        if not isinstance(value, bool):
            raise TypeError("all_short_connectors must be set to a bool")
        clg.parse_options_set_all_short_connectors(self._obj, 1 if value else 0)

    @property
    def repeatable_rand(self):
        """
        If set to True, then a repeatable random sequence will be used, whenever
        a random number is required. The parser almost never uses random
        numbers; currently they are only used in one place: to sample a subset
        of linkages, if there are more parses than linkage_limit.
        """
        return clg.parse_options_get_repeatable_rand(self._obj) == 1

    @repeatable_rand.setter
    def repeatable_rand(self, value):
        if not isinstance(value, bool):
            raise TypeError("repeatable_rand must be set to a bool")
        clg.parse_options_set_repeatable_rand(self._obj, 1 if value else 0)


class LG_Error(Exception):
    """
    Base exception of this module; its static methods also give access to
    the error facility of the underlying library.
    """
    @staticmethod
    def set_handler(ehandler_function, ehandler_data=None):
        """
        Install a new error handler and return the previous one.

        If the low-level call reports the previous handler as a string,
        LG_Error._default_handler is returned in its place.
        """
        old_handler = clg._py_error_set_handler((ehandler_function, ehandler_data))
        if isinstance(old_handler, str):
            return LG_Error._default_handler
        return old_handler

    # lg_error_formatmsg is implemented as method "formatmsg" on errinfo
    #@staticmethod
    #def format(lgerror):
    #    return clg.lg_error_formatmsg(lgerror)

    @staticmethod
    def flush():
        """Call clg.lg_error_flush() and return its result."""
        return clg.lg_error_flush()

    @staticmethod
    def printall(ehandler_func, ehandler_data=None):
        """Call clg._py_error_printall() with the given handler and data."""
        return clg._py_error_printall((ehandler_func, ehandler_data))

    @staticmethod
    def clearall():
        """Call clg.lg_error_clearall() and return its result."""
        return clg.lg_error_clearall()

    @staticmethod
    def message(msg):
        """Print a message through the LG error facility"""
        # Propagate a possible ending "\n" into the format, from which the LG
        # error facility determine if this is a partial or a complete message.
        if msg[-1:] == "\n":      # a newline-ended complete message
            _local_eol = "\n"
            msg = msg[:-1]
        elif msg[-2:] == "\n\\":  # a newline-ended partial message
            _local_eol = ""
            msg = msg[:-1]        # drop only the trailing backslash
        else:
            _local_eol = ""       # a partial message

        return clg._prt_error('%s'+_local_eol, msg)

    @staticmethod
    def _default_handler(errinfo, data):
        # Exceptions (on data): TypeError, ValueError
        clg._py_error_default_handler(errinfo, data)


class LG_DictionaryError(LG_Error):
    """Raised by Dictionary() when the dictionary cannot be created."""
    pass


class Dictionary(object):
    """Wrapper around a dictionary created by clg.dictionary_create_lang()."""
    def __init__(self, lang='en'):
        """
        Create the dictionary for the given language.

        Raises LG_DictionaryError if the low-level call returns a false value.
        """
        self._obj = clg.dictionary_create_lang(lang)
        if not self._obj:
            # We should get the error message from the library.
            raise LG_DictionaryError('Error: Failed to open dictionary {!r}'.format(lang))

    def __str__(self):
        return clg.dictionary_get_lang(self._obj)

    def __del__(self):
        """Delete the underlying dictionary object, if still present."""
        if hasattr(self, '_obj'):
            clg.dictionary_delete(self._obj)
            del self._obj

    def __nonzero__(self):
        """Return False iff the dictionary could not be opened or has been closed"""
        if not hasattr(self, '_obj'):
            return False
        return bool(self._obj)

    __bool__ = __nonzero__      # Account python3

    def linkgrammar_get_dict_version(self):
        """Return the result of clg.linkgrammar_get_dict_version()."""
        return clg.linkgrammar_get_dict_version(self._obj)

    def linkgrammar_get_dict_locale(self):
        """Return the result of clg.linkgrammar_get_dict_locale()."""
        return clg.linkgrammar_get_dict_locale(self._obj)


class Link(object):
    """
    A single link of a Linkage: the two linked words and their labels,
    together with the owning linkage and the index of the link in it.
    """
    def __init__(self, linkage, index, left_word, left_label, right_label, right_word):
        self.linkage, self.index = linkage, index
        self.left_word, self.right_word, self.left_label, self.right_label = \
            left_word, right_word, left_label, right_label

    def __eq__(self, other):
        """
        Two links are equal when their words and labels are equal; the
        linkage and index are not compared. Returns NotImplemented for
        objects that lack the word/label attributes.
        """
        try:
            return self.left_word == other.left_word and self.left_label == other.left_label and \
                   self.right_word == other.right_word and self.right_label == other.right_label
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python2 does not derive "!=" from __eq__, so define it explicitly.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __str__(self):
        if self.left_label == self.right_label:
            return u"%s-%s-%s" % (self.left_word, self.left_label, self.right_word)
        else:
            return u"%s-%s-%s-%s" % (self.left_word, self.left_label, self.right_label, self.right_word)

    def __unicode__(self):
        return self.__str__()

    def __repr__(self):
        return u"Link: %s" % self.__str__()

    def __len__(self):
        """Return the result of clg.linkage_get_link_length() for this link."""
        return clg.linkage_get_link_length(self.linkage._obj, self.index)

    def num_domains(self):
        """Return the result of clg.linkage_get_link_num_domains() for this link."""
        return clg.linkage_get_link_num_domains(self.linkage._obj, self.index)


class Linkage(object):
    """Wrapper around a linkage created by clg.linkage_create()."""

    def __init__(self, idx, sentence, parse_options):
        """
        Create linkage number idx of the given Sentence.

        parse_options is handed to clg.linkage_create() unchanged, so it is
        the low-level options object (a ParseOptions "_obj"), not a
        ParseOptions instance.
        """
        # Keep all args passed into clg.* functions.
        self.sentence, self.parse_options = sentence, parse_options
        self._obj = clg.linkage_create(idx, sentence._obj, parse_options)

    def __del__(self):
        """Delete the underlying linkage object, if still present."""
        if hasattr(self, '_obj'):
            clg.linkage_delete(self._obj)
            del self._obj

    def __nonzero__(self):
        """Return False for SAT sentinel value (NULL); else return True."""
        return bool(self._obj)

    __bool__ = __nonzero__      # Account python3

    def num_of_words(self):
        return clg.linkage_get_num_words(self._obj)

    def num_of_links(self):
        return clg.linkage_get_num_links(self._obj)

    def words(self):
        """Yield every word of the linkage, in index order."""
        for i in range(self.num_of_words()):
            yield self.word(i)

    def word(self, i):
        return clg.linkage_get_word(self._obj, i)

    def unused_word_cost(self):
        return clg.linkage_unused_word_cost(self._obj)

    def link_cost(self):
        return clg.linkage_link_cost(self._obj)

    def disjunct_cost(self):
        return clg.linkage_disjunct_cost(self._obj)

    def link(self, i):
        """Return link number i of this linkage as a Link object."""
        return Link(self, i, self.word(clg.linkage_get_link_lword(self._obj, i)),
                    clg.linkage_get_link_llabel(self._obj, i),
                    clg.linkage_get_link_rlabel(self._obj, i),
                    self.word(clg.linkage_get_link_rword(self._obj, i)))

    def links(self):
        """Yield every link of the linkage, in index order."""
        for i in range(self.num_of_links()):
            yield self.link(i)

    def violation_name(self):
        return clg.linkage_get_violation_name(self._obj)

    def diagram(self, display_walls=False, screen_width=180):
        return clg.linkage_print_diagram(self._obj, display_walls, screen_width)

    def postscript(self, display_walls=True, print_ps_header=False):
        return clg.linkage_print_postscript(self._obj, display_walls, print_ps_header)

    def constituent_tree(self, mode=1):
        return clg.linkage_print_constituent_tree(self._obj, mode)

    def word_byte_start(self, w):
        return clg.linkage_get_word_byte_start(self._obj, w)

    def word_byte_end(self, w):
        return clg.linkage_get_word_byte_end(self._obj, w)

    def word_char_start(self, w):
        return clg.linkage_get_word_char_start(self._obj, w)

    def word_char_end(self, w):
        return clg.linkage_get_word_char_end(self._obj, w)


class LG_TimerExhausted(LG_Error):
    """Raised by Sentence.parse() when the parse-options timer has expired."""
    pass


class Sentence(object):
    """
    sent = Sentence("This is a test.", Dictionary(), ParseOptions())
    # split() before parse() is optional.
    # split() has ParseOptions as an optional argument
    # (defaults to that of Sentence)
    if sent.split(ParseOptions(verbosity=2)) < 0:
        print("Cannot split sentence")
    else:
        linkages = sent.parse()
        print("English: found", len(linkages), "linkages")
        for linkage in linkages:
            print(linkage.diagram())
    """
    def __init__(self, text, lgdict, parse_options):
        # Keep all args passed into clg.* functions.
        self.text, self.dict, self.parse_options = text, lgdict, parse_options
        clg._py_incref(self.dict) # The Sentence struct refers to the Dictionary struct
        self._obj = clg.sentence_create(self.text, self.dict._obj)

    def __del__(self):
        """Delete the underlying sentence and release the Dictionary reference."""
        if hasattr(self, '_obj'):
            clg.sentence_delete(self._obj)
            clg._py_decref(self.dict)
            del self._obj

    def split(self, parse_options=None):
        """Split a sentence. If an error occurs, return a negative number."""
        # Same defaulting rule as in sentence_parse below.
        if parse_options is None:
            parse_options = self.parse_options
        return clg.sentence_split(self._obj, parse_options._obj)

    def __len__(self):
        """The number of tokens in the sentence."""
        return clg.sentence_length(self._obj)

    def null_count(self):
        """Number of null links in the linkages of this sentence."""
        return clg.sentence_null_count(self._obj)

    class sentence_parse(object):
        """
        Result of parsing a Sentence: an iterator over its valid linkages.

        The parse is performed on construction; LG_TimerExhausted is raised
        if the parse-options timer is reported as expired afterwards.
        """
        def __init__(self, sent, parse_options):
            self.sent = sent
            self.num = 0    # index of the next linkage to create
            self.parse_options = sent.parse_options if parse_options is None else parse_options
            self.rc = clg.sentence_parse(sent._obj, self.parse_options._obj)
            if clg.parse_options_timer_expired(self.parse_options._obj):
                raise LG_TimerExhausted()

        def __nonzero__(self):
            """Return False if there was a split or parse error; else return True."""
            return self.rc >= 0

        __bool__ = __nonzero__      # Account python3

        def __iter__(self):
            if 0 == clg.sentence_num_valid_linkages(self.sent._obj):
                return iter(())
            return self

        def __len__(self):
            """The number of valid linkages of the parsed sentence."""
            return clg.sentence_num_valid_linkages(self.sent._obj)

        def next(self):
            """Return the next Linkage, or raise StopIteration when done."""
            if self.num == clg.sentence_num_valid_linkages(self.sent._obj):
                raise StopIteration()
            linkage = Linkage(self.num, self.sent, self.parse_options._obj)
            if not linkage:  # SAT sentinel value
                raise StopIteration()
            self.num += 1
            return linkage

        __next__ = next             # Account python3

    def parse(self, parse_options=None):
        """
        Parse the sentence and return a sentence_parse object.

        If parse_options is None, the options given to the Sentence
        constructor are used.
        """
        return self.sentence_parse(self, parse_options)