1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: Taggers
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Edward Loper <edloper@gmail.com>
6#         Steven Bird <stevenbird1@gmail.com> (minor additions)
7# URL: <http://nltk.org/>
8# For license information, see LICENSE.TXT
9"""
10NLTK Taggers
11
12This package contains classes and interfaces for part-of-speech
13tagging, or simply "tagging".
14
15A "tag" is a case-sensitive string that specifies some property of a token,
16such as its part of speech.  Tagged tokens are encoded as tuples
``(token, tag)``.  For example, the following tagged token combines
18the word ``'fly'`` with a noun part of speech tag (``'NN'``):
19
20    >>> tagged_tok = ('fly', 'NN')
21
22An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
23
24    >>> from nltk import pos_tag, word_tokenize
25    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
26    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
27    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
28
29A Russian tagger is also available if you specify lang="rus". It uses
30the Russian National Corpus tagset:
31
32    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')    # doctest: +SKIP
33    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
34    ('бумажку', 'S'), ('.', 'NONLEX')]
35
36This package defines several taggers, which take a list of tokens,
37assign a tag to each one, and return the resulting list of tagged tokens.
38Most of the taggers are built automatically based on a training corpus.
39For example, the unigram tagger tags each word *w* by checking what
40the most frequent tag for *w* was in a training corpus:
41
42    >>> from nltk.corpus import brown
43    >>> from nltk.tag import UnigramTagger
44    >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
45    >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
46    >>> for word, tag in tagger.tag(sent):
47    ...     print(word, '->', tag)
48    Mitchell -> NP
49    decried -> None
50    the -> AT
51    high -> JJ
52    rate -> NN
53    of -> IN
54    unemployment -> None
55
56Note that words that the tagger has not seen during training receive a tag
57of ``None``.
58
59We evaluate a tagger on data that was not seen during training:
60
61    >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
62    0.73...
63
64For more information, please consult chapter 5 of the NLTK Book.
65"""
66from __future__ import print_function
67
68from nltk.tag.api import TaggerI
69from nltk.tag.util import str2tuple, tuple2str, untag
70from nltk.tag.sequential import (
71    SequentialBackoffTagger,
72    ContextTagger,
73    DefaultTagger,
74    NgramTagger,
75    UnigramTagger,
76    BigramTagger,
77    TrigramTagger,
78    AffixTagger,
79    RegexpTagger,
80    ClassifierBasedTagger,
81    ClassifierBasedPOSTagger,
82)
83from nltk.tag.brill import BrillTagger
84from nltk.tag.brill_trainer import BrillTaggerTrainer
85from nltk.tag.tnt import TnT
86from nltk.tag.hunpos import HunposTagger
87from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
88from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
89from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
90from nltk.tag.mapping import tagset_mapping, map_tag
91from nltk.tag.crf import CRFTagger
92from nltk.tag.perceptron import PerceptronTagger
93
94from nltk.data import load, find
95
# Resource name of the pickled Russian averaged-perceptron model; it is
# resolved with ``find`` in ``_get_tagger`` when ``lang == 'rus'``.
RUS_PICKLE = (
    'taggers/averaged_perceptron_tagger_ru/'
    'averaged_perceptron_tagger_ru.pickle'
)
99
100
def _get_tagger(lang=None):
    """
    Build the perceptron tagger used for the given language.

    :param lang: language code; ``'rus'`` selects the Russian model, any
        other value (including None) gets a default ``PerceptronTagger()``
    :type lang: str
    :return: a ``PerceptronTagger`` instance
    """
    if lang != 'rus':
        return PerceptronTagger()

    # Constructed with ``False`` and then loaded explicitly from the Russian
    # pickle -- presumably ``False`` suppresses loading the default model
    # (TODO confirm against PerceptronTagger.__init__).
    russian_tagger = PerceptronTagger(False)
    russian_tagger.load('file:' + str(find(RUS_PICKLE)))
    return russian_tagger
109
110
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    """
    Tag ``tokens`` with ``tagger`` and optionally map the tags to ``tagset``.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to map the tagger's output to; a falsy value
        returns the tagger's output unchanged
    :type tagset: str
    :param tagger: object whose ``tag(tokens)`` method returns
        ``(token, tag)`` pairs; when None, one is built with
        ``_get_tagger(lang)``
    :param lang: the language of the tokens, either 'eng' or 'rus'
    :type lang: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    :raise NotImplementedError: if ``lang`` is neither 'eng' nor 'rus'
    """
    # Currently only supports English and Russian.
    if lang not in ['eng', 'rus']:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')"
        )

    if tagger is None:
        # ``tagger`` is declared optional, so honour the default instead of
        # failing with an AttributeError on ``None.tag``.
        tagger = _get_tagger(lang)

    tagged_tokens = tagger.tag(tokens)
    if not tagset:
        return tagged_tokens

    # Maps to the specified tagset.
    if lang == 'eng':
        return [
            (token, map_tag('en-ptb', tagset, tag))
            for (token, tag) in tagged_tokens
        ]

    # lang == 'rus' (guaranteed by the check at the top).
    # Note that the new Russian pos tags from the model contain suffixes,
    # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
    # Only the part before the first '=' is passed to the mapping.
    return [
        (token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
        for (token, tag) in tagged_tokens
    ]
134
135
def pos_tag(tokens, tagset=None, lang='eng'):
    """
    Tag a single tokenized sentence with NLTK's currently recommended
    part of speech tagger.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence;
    this function builds a new tagger on every call.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
    :type lang: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    :raise NotImplementedError: if ``lang`` is neither 'eng' nor 'rus'
    """
    return _pos_tag(tokens, tagset=tagset, tagger=_get_tagger(lang), lang=lang)
163
164
def pos_tag_sents(sentences, tagset=None, lang='eng'):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.

    The tagger is built once and reused for every sentence.

    :param sentences: List of sentences to be tagged
    :type sentences: list(list(str))
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
    :type lang: str
    :return: The list of tagged sentences
    :rtype: list(list(tuple(str, str)))
    :raise NotImplementedError: if ``lang`` is neither 'eng' nor 'rus' and
        ``sentences`` is non-empty
    """
    # Build the tagger a single time so it is shared across all sentences.
    tagger = _get_tagger(lang)
    return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences]
181