1 
2 /* Web Polygraph       http://www.web-polygraph.org/
3  * Copyright 2003-2011 The Measurement Factory
4  * Licensed under the Apache License, Version 2.0 */
5 
6 #ifndef POLYGRAPH__XSTD_PREFIXIDENTIFIER_H
7 #define POLYGRAPH__XSTD_PREFIXIDENTIFIER_H
8 
9 #include <ctype.h>
10 #include "xstd/String.h"
11 #include "xstd/TokenIdentifier.h"
12 #include "xstd/gadgets.h"
13 
14 
15 // PrefixIdentifier is a TokenIdentifier with an efficient prefix-based lookup.
16 //
17 // PrefixIdentifier look-up method checks if a given buffer starts
18 // with one of the "known" strings. If it does, an "id" of the
19 // corresponding string is returned.
20 // No "known" string can be prefix of the other -- we do not
21 // want to search for the longest prefix and such...
22 // A buffer is not expected to be zero-terminated.
23 
24 // The search algorithm is very efficient. There is probably
25 // no optimal static (no caching, re-grouping) algorithm.
26 
27 // note: the search tree does not de-allocate its parts on destruction
28 
29 class PrefixIdentifier; // the user interface class, declared last
30 
31 class PrefixIdTable;
32 
33 // a node on the search tree (leaf or not)
34 class PrefixIdNode {
35 	public:
36 		PrefixIdNode();
37 
dead()38 		bool dead() const { return theId == 0; }
leaf()39 		bool leaf() const { return theId > 0; }
id()40 		int id() const { return theId; }
41 
42 		int lookup(const char *buf, int len) const;
43 		void add(String *str, int id, int myLevel);
44 
45 		void optimize();
46 
47 		void report(ostream &os, int myLevel) const;
48 
49 	protected:
50 		union {
51 			PrefixIdTable *theTab; // lookup table (intermediate nodes)
52 			String *theStr;        // known-string (leaf nodes)
53 		} u;
54 		int theId;              // id > 0 => leaf, id < 0 => intermed
55 };
56 
57 
58 // collection of nodes indexed by a char at a given search pos
59 class PrefixIdTable {
60 	public:
61 		PrefixIdTable(int aPos);
62 
63 		int lookup(const char *buf, int len) const;
64 		void add(String *str, int id);
65 
66 		void optimize();
67 		bool single(PrefixIdNode &n);
68 
69 		void report(ostream &os, int myLevel) const;
70 
71 	protected:
72 		inline int lookupIdx(const String &str) const;
73 		// returns -1 if len <= theLookupPos
74 		inline int lookupIdx(const char *buf, int len) const;
75 
76 	protected:
77 		Array<PrefixIdNode> theNodes;
78 		int theLookupPos;
79 };
80 
81 
82 // user interface: build(add everything), optimize, look-up.
83 class PrefixIdentifier: public TokenIdentifier {
84 	public:
PrefixIdentifier()85 		PrefixIdentifier(): isLocked(false) {}
86 
87 		virtual int lookup(const char *buf, int len) const;
88 		virtual int lookup(const String &str) const;
89 
90 		// call this after all additions
91 		void optimize();
92 
93 		// prints a search tree
94 		void report(ostream &os) const;
95 
96 	protected:
97 		virtual void doAdd(String &str, int id);
98 
99 	protected:
100 		PrefixIdNode theHead;
101 
102 		bool isLocked; // additions are fatal, set in optimize()
103 };
104 
105 
106 /* inlined methods */
107 
108 inline
lookupIdx(const String & str)109 int PrefixIdTable::lookupIdx(const String &str) const {
110 	return str.len() > theLookupPos ?
111 		xord(toupper(str[theLookupPos])) : theLookupPos;
112 }
113 
114 inline
lookupIdx(const char * buf,int len)115 int PrefixIdTable::lookupIdx(const char *buf, int len) const {
116 	return len > theLookupPos ?
117 		xord(toupper(buf[theLookupPos])) : -1;
118 }
119 
120 #endif
121