1 // Copyright 2017 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS-IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 
16 // Author: ericv@google.com (Eric Veach)
17 //
18 // S2RegionTermIndexer is a helper class for adding spatial data to an
19 // information retrieval system.  Such systems work by converting documents
20 // into a collection of "index terms" (e.g., representing words or phrases),
21 // and then building an "inverted index" that maps each term to a list of
22 // documents (and document positions) where that term occurs.
23 //
24 // This class deals with the problem of converting spatial data into index
25 // terms, which can then be indexed along with the other document information.
26 //
27 // Spatial data is represented using the S2Region type.  Useful S2Region
28 // subtypes include:
29 //
30 //   S2Cap
31 //    - a disc-shaped region
32 //
33 //   S2LatLngRect
34 //    - a rectangle in latitude-longitude coordinates
35 //
36 //   S2Polyline
37 //    - a polyline
38 //
39 //   S2Polygon
40 //    - a polygon, possibly with multiple holes and/or shells
41 //
42 //   S2CellUnion
43 //    - a region approximated as a collection of S2CellIds
44 //
45 //   S2ShapeIndexRegion
46 //    - an arbitrary collection of points, polylines, and polygons
47 //
48 //   S2ShapeIndexBufferedRegion
49 //    - like the above, but expanded by a given radius
50 //
51 //   S2RegionUnion, S2RegionIntersection
52 //    - the union or intersection of arbitrary other regions
53 //
54 // So for example, if you want to query documents that are within 500 meters
55 // of a polyline, you could use an S2ShapeIndexBufferedRegion containing the
56 // polyline with a radius of 500 meters.
57 //
58 // Example usage:
59 //
60 //   // This class is intended to be used with an external key-value store,
61 //   // but for this example will we use an unordered_map.  The key is an
62 //   // index term, and the value is a set of document ids.
63 //   std::unordered_map<string, std::vector<int>> index;
64 //
65 //   // Create an indexer that uses up to 10 cells to approximate each region.
66 //   S2RegionTermIndexer::Options options;
67 //   options.set_max_cells(10);
68 //   S2RegionTermIndexer indexer(options);
69 //
70 //   // For this example, we index a disc-shaped region with a 10km radius.
71 //   S2LatLng center = S2LatLng::FromDegrees(44.1, -56.235);
72 //   S1Angle radius = S2Earth::ToAngle(util::units::Kilometers(10.0));
73 //   S2Cap cap(center.ToPoint(), radius);
74 //
75 //   // Add the terms for this disc-shaped region to the index.
76 //   for (const auto& term : indexer.GetIndexTerms(cap)) {
77 //     index[term].push_back(kSomeDocumentId);
78 //   }
79 //
80 //   // And now at query time: build a latitude-longitude rectangle.
81 //   S2LatLngRect rect(S2LatLng::FromDegrees(-12.1, 10.2),
82 //                     S2LatLng::FromDegrees(-9.2,  120.5));
83 //
84 //   // Convert the query region to a set of terms, and compute the union
85 //   // of the document ids associated with those terms.
86 //   std::set<int> doc_ids;
87 //   for (const auto& term : indexer.GetQueryTerms(rect)) {
88 //     doc_ids.insert(index[term].begin(), index[term].end());
89 //   }
90 //
91 //   // "doc_ids" now contains all documents that intersect the query region,
92 //   // along with some documents that nearly intersect it.  The results can
93 //   // be further pruned if desired by retrieving the original regions that
94 //   // were indexed (i.e., the document contents) and checking for exact
95 //   // intersection with the query region.
96 
97 #ifndef S2_S2REGION_TERM_INDEXER_H_
98 #define S2_S2REGION_TERM_INDEXER_H_
99 
100 #include <string>
101 #include <vector>
102 
103 #include "s2/s2cell_union.h"
104 #include "s2/s2region.h"
105 #include "s2/s2region_coverer.h"
106 #include "s2/third_party/absl/strings/string_view.h"
107 
108 class S2RegionTermIndexer {
109  public:
110   // The following parameters control the tradeoffs between index size, query
111   // size, and accuracy (see s2region_coverer.h for details).
112   //
113   // IMPORTANT: You must use the same values for min_level(), max_level(), and
114   // level_mod() for both indexing and queries, otherwise queries will return
115   // incorrect results.  However, max_cells() can be changed as often as
116   // desired -- you can even change this parameter for every region.
117 
118   class Options : public S2RegionCoverer::Options {
119    public:
120     Options();
121 
122     ///////////////// Options Inherited From S2RegionCoverer ////////////////
123 
124     // max_cells() controls the maximum number of cells when approximating
125     // each region.  This parameter value may be changed as often as desired
126     // (using mutable_options(), see below), e.g. to approximate some regions
127     // more accurately than others.
128     //
129     // Increasing this value during indexing will make indexes more accurate
130     // but larger.  Increasing this value for queries will make queries more
131     // accurate but slower.  (See s2region_coverer.h for details on how this
132     // parameter affects accuracy.)  For example, if you don't mind large
133     // indexes but want fast serving, it might be reasonable to set
134     // max_cells() == 100 during indexing and max_cells() == 8 for queries.
135     //
136     // DEFAULT: 8  (coarse approximations)
137     using S2RegionCoverer::Options::max_cells;
138     using S2RegionCoverer::Options::set_max_cells;
139 
140     // min_level() and max_level() control the minimum and maximum size of the
141     // S2Cells used to approximate regions.  Setting these parameters
142     // appropriately can reduce the size of the index and speed up queries by
143     // reducing the number of terms needed.  For example, if you know that
144     // your query regions will rarely be less than 100 meters in width, then
145     // you could set max_level() as follows:
146     //
147     //   options.set_max_level(S2::kAvgEdge.GetClosestLevel(
148     //       S2Earth::MetersToRadians(100)));
149     //
150     // This restricts the index to S2Cells that are approximately 100 meters
151     // across or larger.  Similar, if you know that query regions will rarely
152     // be larger than 1000km across, then you could set min_level() similarly.
153     //
154     // If min_level() is set too high, then large regions may generate too
155     // many query terms.  If max_level() is set too low, then small query
156     // regions will not be able to discriminate which regions they intersect
157     // very precisely and may return many more candidates than necessary.
158     //
159     // If you have no idea about the scale of the regions being queried,
160     // it is perfectly fine to set min_level() == 0 and max_level() == 30
161     // (== S2::kMaxLevel).  The only drawback is that may result in a larger
162     // index and slower queries.
163     //
164     // The default parameter values are suitable for query regions ranging
165     // from about 100 meters to 3000 km across.
166     //
167     // DEFAULT: 4  (average cell width == 600km)
168     using S2RegionCoverer::Options::min_level;
169     using S2RegionCoverer::Options::set_min_level;
170 
171     // DEFAULT: 16 (average cell width == 150m)
172     using S2RegionCoverer::Options::max_level;
173     using S2RegionCoverer::Options::set_max_level;
174 
175     // Setting level_mod() to a value greater than 1 increases the effective
176     // branching factor of the S2Cell hierarchy by skipping some levels.  For
177     // example, if level_mod() == 2 then every second level is skipped (which
178     // increases the effective branching factor to 16).  You might want to
179     // consider doing this if your query regions are typically very small
180     // (e.g., single points) and you don't mind increasing the index size
181     // (since skipping levels will reduce the accuracy of cell coverings for a
182     // given max_cells() limit).
183     //
184     // DEFAULT: 1  (don't skip any cell levels)
185     using S2RegionCoverer::Options::level_mod;
186     using S2RegionCoverer::Options::set_level_mod;
187 
188     // If your index will only contain points (rather than regions), be sure
189     // to set this flag.  This will generate smaller and faster queries that
190     // are specialized for the points-only case.
191     //
192     // With the default quality settings, this flag reduces the number of
193     // query terms by about a factor of two.  (The improvement gets smaller
194     // as max_cells() is increased, but there is really no reason not to use
195     // this flag if your index consists entirely of points.)
196     //
197     // DEFAULT: false
index_contains_points_only()198     bool index_contains_points_only() const { return points_only_; }
set_index_contains_points_only(bool value)199     void set_index_contains_points_only(bool value) { points_only_ = value; }
200 
201     // If true, the index will be optimized for space rather than for query
202     // time.  With the default quality settings, this flag reduces the number
203     // of index terms and increases the number of query terms by the same
204     // factor (approximately 1.3).  The factor increases up to a limiting
205     // ratio of 2.0 as max_cells() is increased.
206     //
207     // CAVEAT: This option has no effect if the index contains only points.
208     //
209     // DEFAULT: false
optimize_for_space()210     bool optimize_for_space() const { return optimize_for_space_; }
set_optimize_for_space(bool value)211     void set_optimize_for_space(bool value) { optimize_for_space_ = value; }
212 
213     // A non-alphanumeric character that is used internally to distinguish
214     // between two different types of terms (by adding this character).
215     //
216     // REQUIRES: "ch" is non-alphanumeric.
217     // DEFAULT: '$'
marker()218     const string& marker() const { return marker_; }
marker_character()219     char marker_character() const { return marker_[0]; }
220     void set_marker_character(char ch);
221 
222    private:
223     bool points_only_ = false;
224     bool optimize_for_space_ = false;
225     string marker_ = string(1, '$');
226   };
227 
228   // Default constructor.  Options can be set using mutable_options().
229   S2RegionTermIndexer();
230   ~S2RegionTermIndexer();
231 
232   // Constructs an S2RegionTermIndexer with the given options.
233   explicit S2RegionTermIndexer(const Options& options);
234 
235   // S2RegionTermIndexer is movable but not copyable.
236   S2RegionTermIndexer(const S2RegionTermIndexer&) = delete;
237   S2RegionTermIndexer& operator=(const S2RegionTermIndexer&) = delete;
238   S2RegionTermIndexer(S2RegionTermIndexer&&);
239   S2RegionTermIndexer& operator=(S2RegionTermIndexer&&);
240 
241   // Returns the current options.  Options can be modifed between calls.
options()242   const Options& options() const { return options_; }
mutable_options()243   Options* mutable_options() { return &options_; }
244 
245   // Converts the given region into a set of terms for indexing.  Terms
246   // consist of lowercase letters, numbers, '$', and an optional prefix.
247   //
248   // "prefix" is a unique prefix used to distinguish S2 terms from other terms
249   // in the repository.  The prefix may also be used to index documents with
250   // multiple types of location information (e.g. store footprint, entrances,
251   // parking lots, etc).  The prefix should be kept short since it is
252   // prepended to every term.
253   std::vector<string> GetIndexTerms(const S2Region& region,
254                                     absl::string_view prefix);
255 
256   // Converts a given query region into a set of terms.  If you compute the
257   // union of all the documents associated with these terms, the result will
258   // include all documents whose index region intersects the query region.
259   //
260   // "prefix" should match the corresponding value used when indexing.
261   std::vector<string> GetQueryTerms(const S2Region& region,
262                                     absl::string_view prefix);
263 
264   // Convenience methods that accept an S2Point rather than S2Region.  (These
265   // methods are also faster.)
266   //
267   // Note that you can index an S2LatLng by converting it to an S2Point first:
268   //     auto terms = GetIndexTerms(S2Point(latlng), ...);
269   std::vector<string> GetIndexTerms(const S2Point& point,
270                                     absl::string_view prefix);
271   std::vector<string> GetQueryTerms(const S2Point& point,
272                                     absl::string_view prefix);
273 
274   // Low-level methods that accept an S2CellUnion covering of the region to be
275   // indexed or queried.
276   //
277   // REQUIRES: "covering" satisfies the S2RegionCoverer::Options for this
278   //           class (i.e., max_cells, min_level, max_level, and level_mod).
279   //
280   // If you have a covering that was computed using different options, then
281   // you can either call the regular S2Region methods (since S2CellUnion is a
282   // type of S2Region), or "canonicalize" the covering first by calling
283   // S2RegionCoverer::CanonicalizeCovering() with the same options.
284   std::vector<string> GetIndexTermsForCanonicalCovering(
285       const S2CellUnion& covering, absl::string_view prefix);
286   std::vector<string> GetQueryTermsForCanonicalCovering(
287       const S2CellUnion& covering, absl::string_view prefix);
288 
289  private:
290   enum TermType { ANCESTOR, COVERING };
291 
292   string GetTerm(TermType term_type, const S2CellId& id,
293                  absl::string_view prefix) const;
294 
295   Options options_;
296   S2RegionCoverer coverer_;
297 };
298 
299 #endif  // S2_S2REGION_TERM_INDEXER_H_
300