1
2 /* Web Polygraph http://www.web-polygraph.org/
3 * Copyright 2003-2011 The Measurement Factory
4 * Licensed under the Apache License, Version 2.0 */
5
6 #ifndef POLYGRAPH__XSTD_PREFIXIDENTIFIER_H
7 #define POLYGRAPH__XSTD_PREFIXIDENTIFIER_H
8
9 #include <ctype.h>
10 #include "xstd/String.h"
11 #include "xstd/TokenIdentifier.h"
12 #include "xstd/gadgets.h"
13
14
15 // PrefixIdentifier is a TokenIdentifier with an efficient prefix-based lookup.
16 //
17 // PrefixIdentifier look-up method checks if a given buffer starts
18 // with one of the "known" strings. If it does, an "id" of the
19 // corresponding string is returned.
20 // No "known" string can be prefix of the other -- we do not
21 // want to search for the longest prefix and such...
22 // A buffer is not expected to be zero-terminated.
23
24 // The search algorithm is very efficient. There is probably
25 // no optimal static (no caching, re-grouping) algorithm.
26
27 // note: the search tree does not de-allocate its parts on destruction
28
29 class PrefixIdentifier; // the user interface class, declared last
30
31 class PrefixIdTable;
32
33 // a node on the search tree (leaf or not)
34 class PrefixIdNode {
35 public:
36 PrefixIdNode();
37
dead()38 bool dead() const { return theId == 0; }
leaf()39 bool leaf() const { return theId > 0; }
id()40 int id() const { return theId; }
41
42 int lookup(const char *buf, int len) const;
43 void add(String *str, int id, int myLevel);
44
45 void optimize();
46
47 void report(ostream &os, int myLevel) const;
48
49 protected:
50 union {
51 PrefixIdTable *theTab; // lookup table (intermediate nodes)
52 String *theStr; // known-string (leaf nodes)
53 } u;
54 int theId; // id > 0 => leaf, id < 0 => intermed
55 };
56
57
58 // collection of nodes indexed by a char at a given search pos
59 class PrefixIdTable {
60 public:
61 PrefixIdTable(int aPos);
62
63 int lookup(const char *buf, int len) const;
64 void add(String *str, int id);
65
66 void optimize();
67 bool single(PrefixIdNode &n);
68
69 void report(ostream &os, int myLevel) const;
70
71 protected:
72 inline int lookupIdx(const String &str) const;
73 // returns -1 if len <= theLookupPos
74 inline int lookupIdx(const char *buf, int len) const;
75
76 protected:
77 Array<PrefixIdNode> theNodes;
78 int theLookupPos;
79 };
80
81
82 // user interface: build(add everything), optimize, look-up.
83 class PrefixIdentifier: public TokenIdentifier {
84 public:
PrefixIdentifier()85 PrefixIdentifier(): isLocked(false) {}
86
87 virtual int lookup(const char *buf, int len) const;
88 virtual int lookup(const String &str) const;
89
90 // call this after all additions
91 void optimize();
92
93 // prints a search tree
94 void report(ostream &os) const;
95
96 protected:
97 virtual void doAdd(String &str, int id);
98
99 protected:
100 PrefixIdNode theHead;
101
102 bool isLocked; // additions are fatal, set in optimize()
103 };
104
105
106 /* inlined methods */
107
108 inline
lookupIdx(const String & str)109 int PrefixIdTable::lookupIdx(const String &str) const {
110 return str.len() > theLookupPos ?
111 xord(toupper(str[theLookupPos])) : theLookupPos;
112 }
113
114 inline
lookupIdx(const char * buf,int len)115 int PrefixIdTable::lookupIdx(const char *buf, int len) const {
116 return len > theLookupPos ?
117 xord(toupper(buf[theLookupPos])) : -1;
118 }
119
120 #endif
121