1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 
4 #ifdef __GNUG__
5 #pragma implementation
6 #endif
7 #include "splib.h"
8 #include "types.h"
9 #include "macros.h"
10 #include "StringOf.h"
11 #include "Trie.h"
12 #include "TrieBuilder.h"
13 #include "Priority.h"
14 #include <stdlib.h>
15 
16 #ifdef SP_NAMESPACE
17 namespace SP_NAMESPACE {
18 #endif
19 
~Trie()20 Trie::~Trie()
21 {
22   if (next_)
23     delete [] next_;
24 }
25 
Trie(const Trie & t)26 Trie::Trie(const Trie &t)
27 : nCodes_(t.nCodes_),
28   token_(t.token_),
29   tokenLength_(t.tokenLength_),
30   priority_(t.priority_),
31   blank_(t.blank_)
32 {
33   if (t.next_) {
34     next_ = new Trie[nCodes_];
35     for (int i = 0; i < nCodes_; i++)
36       next_[i] = t.next_[i];
37   }
38   else
39     next_ = 0;
40 }
41 
operator =(const Trie & t)42 Trie &Trie::operator=(const Trie &t)
43 {
44   if (next_)
45     delete [] next_;
46   nCodes_ = t.nCodes_;
47   token_ = t.token_;
48   tokenLength_ = t.tokenLength_;
49   priority_ = t.priority_;
50   blank_ = t.blank_;
51   if (t.next_) {
52     next_ = new Trie[nCodes_];
53     for (int i = 0; i < nCodes_; i++)
54       next_[i] = t.next_[i];
55   }
56   else
57     next_ = 0;
58   return *this;
59 }
60 
TrieBuilder(int nCodes)61 TrieBuilder::TrieBuilder(int nCodes)
62 : nCodes_(nCodes), root_(new Trie)
63 {
64   root_->token_ = 0;
65   root_->tokenLength_ = 0;
66   root_->priority_ = Priority::data;
67   root_->nCodes_ = nCodes;
68 }
69 
recognize(const String<EquivCode> & chars,Token t,Priority::Type priority,TokenVector & ambiguities)70 void TrieBuilder::recognize(const String<EquivCode> &chars,
71 			    Token t,
72 			    Priority::Type priority,
73 			    TokenVector &ambiguities)
74 {
75   setToken(extendTrie(root_.pointer(), chars), chars.size(), t, priority,
76 	   ambiguities);
77 }
78 
recognize(const String<EquivCode> & chars,const String<EquivCode> & set,Token t,Priority::Type priority,TokenVector & ambiguities)79 void TrieBuilder::recognize(const String<EquivCode> &chars,
80 			    const String<EquivCode> &set,
81 			    Token t,
82 			    Priority::Type priority,
83 			    TokenVector &ambiguities)
84 {
85   Trie *trie = extendTrie(root_.pointer(), chars);
86 
87   for (size_t i = 0; i < set.size(); i++)
88     setToken(forceNext(trie, set[i]), chars.size() + 1, t, priority,
89 	     ambiguities);
90 }
91 
recognizeB(const String<EquivCode> & chars,int bSequenceLength,size_t maxBlankSequence,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,TokenVector & ambiguities)92 void TrieBuilder::recognizeB(const String<EquivCode> &chars,
93 			     int bSequenceLength,
94 			     size_t maxBlankSequence,
95 			     const String<EquivCode> &blankCodes,
96 			     const String<EquivCode> &chars2,
97 			     Token token,
98 			     TokenVector &ambiguities)
99 {
100   doB(extendTrie(root_.pointer(), chars),
101       chars.size(),
102       bSequenceLength,
103       maxBlankSequence,
104       blankCodes,
105       chars2,
106       token,
107       Priority::blank(bSequenceLength),
108       ambiguities);
109 }
110 
recognizeEE(EquivCode code,Token t)111 void TrieBuilder::recognizeEE(EquivCode code, Token t)
112 {
113   Trie *trie = forceNext(root_.pointer(), code);
114   trie->tokenLength_ = 0;	// it has length 0 in the buffer
115   trie->token_ = t;
116   trie->priority_ = Priority::data;
117 }
118 
doB(Trie * trie,int tokenLength,int minBLength,size_t maxLength,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,Priority::Type pri,TokenVector & ambiguities)119 void TrieBuilder::doB(Trie *trie,
120 		      int tokenLength,
121 		      int minBLength,
122 		      size_t maxLength,
123 		      const String<EquivCode> &blankCodes,
124 		      const String<EquivCode> &chars2,
125 		      Token token,
126 		      Priority::Type pri,
127 		      TokenVector &ambiguities)
128 {
129   if (minBLength == 0 && trie->next_ == 0) {
130     if (!trie->blank_) {
131       BlankTrie *b = new BlankTrie;
132       trie->blank_ = b;
133       b->maxBlanksToScan_ = maxLength;
134       b->additionalLength_ = tokenLength;
135       b->codeIsBlank_.assign(nCodes_, 0);
136       for (size_t i = 0; i < blankCodes.size(); i++)
137 	b->codeIsBlank_[blankCodes[i]] = 1;
138       b->tokenLength_ = 0;
139       b->token_ = 0;
140       b->priority_ = Priority::data;
141       b->nCodes_ = nCodes_;
142     }
143     else {
144       // A B sequence is not allowed to be adjacent to a character
145       // that can occur in a blank sequence, so maxLength will be
146       // the same at a node, no matter how we got there.
147       ASSERT(trie->blank_->maxBlanksToScan_ == maxLength);
148       ASSERT(trie->blank_->additionalLength_ == tokenLength);
149     }
150     if (chars2.size() == 0)
151       setToken(trie, tokenLength, token, pri, ambiguities);
152     else
153       setToken(extendTrie(trie->blank_.pointer(), chars2),
154 	       chars2.size(),
155 	       token,
156 	       pri,
157 	       ambiguities);
158   }
159   else {
160     if (minBLength == 0)
161       setToken(extendTrie(trie, chars2), tokenLength + chars2.size(),
162 	       token, pri, ambiguities);
163     for (size_t i = 0; i < blankCodes.size(); i++)
164       doB(forceNext(trie, blankCodes[i]),
165 	  tokenLength + 1,
166 	  minBLength == 0 ? 0 : minBLength - 1,
167 	  maxLength - 1,
168 	  blankCodes,
169 	  chars2,
170 	  token,
171 	  pri,
172 	  ambiguities);
173   }
174 }
175 
extendTrie(Trie * trie,const String<EquivCode> & s)176 Trie *TrieBuilder::extendTrie(Trie *trie, const String<EquivCode> &s)
177 {
178   for (size_t i = 0; i < s.size(); i++)
179     trie = forceNext(trie, s[i]);
180   return trie;
181 }
182 
setToken(Trie * trie,int tokenLength,Token token,Priority::Type pri,TokenVector & ambiguities)183 void TrieBuilder::setToken(Trie *trie,
184 			   int tokenLength,
185 			   Token token,
186 			   Priority::Type pri,
187 			   TokenVector &ambiguities)
188 {
189   if (tokenLength > trie->tokenLength_
190       || (tokenLength == trie->tokenLength_
191 	  && pri > trie->priority_)) {
192     trie->tokenLength_ = tokenLength;
193     trie->token_ = token;
194     trie->priority_ = pri;
195   }
196   else if (trie->tokenLength_ == tokenLength
197 	   && trie->priority_ == pri
198 	   && trie->token_ != token
199 	   && trie->token_ != 0) {
200     ambiguities.push_back(Token(trie->token_));
201     ambiguities.push_back(token);
202   }
203   if (trie->hasNext()) {
204     for (int i = 0; i < nCodes_; i++)
205       setToken(&trie->next_[i], tokenLength, token, pri, ambiguities);
206   }
207 }
208 
copyInto(Trie * into,const Trie * from,int additionalLength)209 void TrieBuilder::copyInto(Trie *into, const Trie *from, int additionalLength)
210 {
211   if (from->token_ != 0) {
212     TokenVector ambiguities;
213     setToken(into, from->tokenLength_ + additionalLength, from->token_,
214 	     from->priority_, ambiguities);
215     ASSERT(ambiguities.size() == 0);
216   }
217   if (from->hasNext())
218     for (int i = 0; i < nCodes_; i++)
219       copyInto(forceNext(into, i), &from->next_[i], additionalLength);
220 }
221 
forceNext(Trie * trie,EquivCode c)222 Trie *TrieBuilder::forceNext(Trie *trie, EquivCode c)
223 {
224   if (!trie->hasNext()) {
225     trie->next_ = new Trie[nCodes_];
226     if (trie->blank_) {
227       trie->blank_->additionalLength_ += 1;
228       trie->blank_->maxBlanksToScan_ -= 1;
229     }
230     Owner<BlankTrie> blankOwner(trie->blank_.extract());
231     const BlankTrie *b = blankOwner.pointer();
232     for (int i = 0; i < nCodes_; i++) {
233       Trie *p = &trie->next_[i];
234       if (b && b->codeIsBlank(i))
235 	trie->next_[i].blank_ = (blankOwner
236 				 ? blankOwner.extract()
237 				 : new BlankTrie(*b));
238       p->token_ = trie->token_;
239       p->tokenLength_ = trie->tokenLength_;
240       p->priority_ = trie->priority_;
241       p->nCodes_ = nCodes_;
242     }
243     if (b)
244       // -1 because 1 was added above
245       copyInto(trie, b, b->additionalLength_ - 1);
246   }
247   return &trie->next_[c];
248 }
249 
250 #ifdef SP_NAMESPACE
251 }
252 #endif
253