# -*- coding: utf-8 -*-
# Natural Language Toolkit: Taggers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Taggers

This package contains classes and interfaces for part-of-speech
tagging, or simply "tagging".

A "tag" is a case-sensitive string that specifies some property of a token,
such as its part of speech.  Tagged tokens are encoded as tuples
``(token, tag)``.  For example, the following tagged token combines
the word ``'fly'`` with a noun part of speech tag (``'NN'``):

    >>> tagged_tok = ('fly', 'NN')

An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:

    >>> from nltk import pos_tag, word_tokenize
    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]

A Russian tagger is also available if you specify lang="rus". It uses
the Russian National Corpus tagset:

    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')    # doctest: +SKIP
    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
    ('бумажку', 'S'), ('.', 'NONLEX')]

This package defines several taggers, which take a list of tokens,
assign a tag to each one, and return the resulting list of tagged tokens.
Most of the taggers are built automatically based on a training corpus.
For example, the unigram tagger tags each word *w* by checking what
the most frequent tag for *w* was in a training corpus:

    >>> from nltk.corpus import brown
    >>> from nltk.tag import UnigramTagger
    >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
    >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
    >>> for word, tag in tagger.tag(sent):
    ...     print(word, '->', tag)
    Mitchell -> NP
    decried -> None
    the -> AT
    high -> JJ
    rate -> NN
    of -> IN
    unemployment -> None

Note that words that the tagger has not seen during training receive a tag
of ``None``.

We evaluate a tagger on data that was not seen during training:

    >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
    0.73...

For more information, please consult chapter 5 of the NLTK Book.
"""
from __future__ import print_function

from nltk.tag.api import TaggerI
from nltk.tag.util import str2tuple, tuple2str, untag
from nltk.tag.sequential import (
    SequentialBackoffTagger,
    ContextTagger,
    DefaultTagger,
    NgramTagger,
    UnigramTagger,
    BigramTagger,
    TrigramTagger,
    AffixTagger,
    RegexpTagger,
    ClassifierBasedTagger,
    ClassifierBasedPOSTagger,
)
from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
from nltk.tag.crf import CRFTagger
from nltk.tag.perceptron import PerceptronTagger

from nltk.data import load, find

# Location (within nltk_data) of the pre-trained Russian perceptron model.
RUS_PICKLE = (
    'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
)


def _get_tagger(lang=None):
    """Return the default perceptron tagger for the given language.

    :param lang: ``'rus'`` loads the pre-trained Russian model from the
        local nltk_data directory; any other value (including ``None``)
        loads the default English model.
    :return: a ``PerceptronTagger`` ready for tagging
    """
    if lang == 'rus':
        # Instantiate without loading the default (English) model, then
        # load the Russian model explicitly.
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger


def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    """Tag *tokens* with *tagger*, optionally mapping tags into *tagset*.

    :param tokens: the tokens to tag
    :param tagset: target tagset to map the native tags into (e.g.
        ``'universal'``); if falsy, the tagger's native tags are returned
    :param tagger: the tagger instance to use (see ``_get_tagger``)
    :param lang: ``'eng'`` or ``'rus'``; selects the tagset-mapping source
    :raises NotImplementedError: if *lang* is not ``'eng'`` or ``'rus'``
    :return: list of ``(token, tag)`` tuples
    """
    # Currently only supports English and Russian.
    if lang not in ('eng', 'rus'):
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')"
        )
    tagged_tokens = tagger.tag(tokens)
    if tagset:  # Maps to the specified tagset.
        if lang == 'eng':
            tagged_tokens = [
                (token, map_tag('en-ptb', tagset, tag))
                for (token, tag) in tagged_tokens
            ]
        elif lang == 'rus':
            # Note that the new Russian pos tags from the model contain suffixes,
            # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
            tagged_tokens = [
                (token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
                for (token, tag) in tagged_tokens
            ]
    return tagged_tokens


def pos_tag(tokens, tagset=None, lang='eng'):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
    :type lang: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = _get_tagger(lang)
    return _pos_tag(tokens, tagset, tagger, lang)


def pos_tag_sents(sentences, tagset=None, lang='eng'):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.

    :param sentences: List of sentences to be tagged
    :type sentences: list(list(str))
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
    :type lang: str
    :return: The list of tagged sentences
    :rtype: list(list(tuple(str, str)))
    """
    # Load the tagger once and reuse it for every sentence — this is what
    # makes this function cheaper than calling pos_tag() per sentence.
    tagger = _get_tagger(lang)
    return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences]