1 // Copyright 2017 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS-IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 16 // Author: ericv@google.com (Eric Veach) 17 // 18 // S2RegionTermIndexer is a helper class for adding spatial data to an 19 // information retrieval system. Such systems work by converting documents 20 // into a collection of "index terms" (e.g., representing words or phrases), 21 // and then building an "inverted index" that maps each term to a list of 22 // documents (and document positions) where that term occurs. 23 // 24 // This class deals with the problem of converting spatial data into index 25 // terms, which can then be indexed along with the other document information. 26 // 27 // Spatial data is represented using the S2Region type. Useful S2Region 28 // subtypes include: 29 // 30 // S2Cap 31 // - a disc-shaped region 32 // 33 // S2LatLngRect 34 // - a rectangle in latitude-longitude coordinates 35 // 36 // S2Polyline 37 // - a polyline 38 // 39 // S2Polygon 40 // - a polygon, possibly with multiple holes and/or shells 41 // 42 // S2CellUnion 43 // - a region approximated as a collection of S2CellIds 44 // 45 // S2ShapeIndexRegion 46 // - an arbitrary collection of points, polylines, and polygons 47 // 48 // S2ShapeIndexBufferedRegion 49 // - like the above, but expanded by a given radius 50 // 51 // S2RegionUnion, S2RegionIntersection 52 // - the union or intersection of arbitrary other regions 53 // 54 // So for example, if you want to query documents that are within 500 meters 55 // of a polyline, you could use an S2ShapeIndexBufferedRegion containing the 56 // polyline with a radius of 500 meters. 57 // 58 // Example usage: 59 // 60 // // This class is intended to be used with an external key-value store, 61 // // but for this example will we use an unordered_map. The key is an 62 // // index term, and the value is a set of document ids. 63 // std::unordered_map<string, std::vector<int>> index; 64 // 65 // // Create an indexer that uses up to 10 cells to approximate each region. 66 // S2RegionTermIndexer::Options options; 67 // options.set_max_cells(10); 68 // S2RegionTermIndexer indexer(options); 69 // 70 // // For this example, we index a disc-shaped region with a 10km radius. 71 // S2LatLng center = S2LatLng::FromDegrees(44.1, -56.235); 72 // S1Angle radius = S2Earth::ToAngle(util::units::Kilometers(10.0)); 73 // S2Cap cap(center.ToPoint(), radius); 74 // 75 // // Add the terms for this disc-shaped region to the index. 76 // for (const auto& term : indexer.GetIndexTerms(cap)) { 77 // index[term].push_back(kSomeDocumentId); 78 // } 79 // 80 // // And now at query time: build a latitude-longitude rectangle. 81 // S2LatLngRect rect(S2LatLng::FromDegrees(-12.1, 10.2), 82 // S2LatLng::FromDegrees(-9.2, 120.5)); 83 // 84 // // Convert the query region to a set of terms, and compute the union 85 // // of the document ids associated with those terms. 86 // std::set<int> doc_ids; 87 // for (const auto& term : indexer.GetQueryTerms(rect)) { 88 // doc_ids.insert(index[term].begin(), index[term].end()); 89 // } 90 // 91 // // "doc_ids" now contains all documents that intersect the query region, 92 // // along with some documents that nearly intersect it. The results can 93 // // be further pruned if desired by retrieving the original regions that 94 // // were indexed (i.e., the document contents) and checking for exact 95 // // intersection with the query region. 96 97 #ifndef S2_S2REGION_TERM_INDEXER_H_ 98 #define S2_S2REGION_TERM_INDEXER_H_ 99 100 #include <string> 101 #include <vector> 102 103 #include "s2/s2cell_union.h" 104 #include "s2/s2region.h" 105 #include "s2/s2region_coverer.h" 106 #include "s2/third_party/absl/strings/string_view.h" 107 108 class S2RegionTermIndexer { 109 public: 110 // The following parameters control the tradeoffs between index size, query 111 // size, and accuracy (see s2region_coverer.h for details). 112 // 113 // IMPORTANT: You must use the same values for min_level(), max_level(), and 114 // level_mod() for both indexing and queries, otherwise queries will return 115 // incorrect results. However, max_cells() can be changed as often as 116 // desired -- you can even change this parameter for every region. 117 118 class Options : public S2RegionCoverer::Options { 119 public: 120 Options(); 121 122 ///////////////// Options Inherited From S2RegionCoverer //////////////// 123 124 // max_cells() controls the maximum number of cells when approximating 125 // each region. This parameter value may be changed as often as desired 126 // (using mutable_options(), see below), e.g. to approximate some regions 127 // more accurately than others. 128 // 129 // Increasing this value during indexing will make indexes more accurate 130 // but larger. Increasing this value for queries will make queries more 131 // accurate but slower. (See s2region_coverer.h for details on how this 132 // parameter affects accuracy.) For example, if you don't mind large 133 // indexes but want fast serving, it might be reasonable to set 134 // max_cells() == 100 during indexing and max_cells() == 8 for queries. 135 // 136 // DEFAULT: 8 (coarse approximations) 137 using S2RegionCoverer::Options::max_cells; 138 using S2RegionCoverer::Options::set_max_cells; 139 140 // min_level() and max_level() control the minimum and maximum size of the 141 // S2Cells used to approximate regions. Setting these parameters 142 // appropriately can reduce the size of the index and speed up queries by 143 // reducing the number of terms needed. For example, if you know that 144 // your query regions will rarely be less than 100 meters in width, then 145 // you could set max_level() as follows: 146 // 147 // options.set_max_level(S2::kAvgEdge.GetClosestLevel( 148 // S2Earth::MetersToRadians(100))); 149 // 150 // This restricts the index to S2Cells that are approximately 100 meters 151 // across or larger. Similar, if you know that query regions will rarely 152 // be larger than 1000km across, then you could set min_level() similarly. 153 // 154 // If min_level() is set too high, then large regions may generate too 155 // many query terms. If max_level() is set too low, then small query 156 // regions will not be able to discriminate which regions they intersect 157 // very precisely and may return many more candidates than necessary. 158 // 159 // If you have no idea about the scale of the regions being queried, 160 // it is perfectly fine to set min_level() == 0 and max_level() == 30 161 // (== S2::kMaxLevel). The only drawback is that may result in a larger 162 // index and slower queries. 163 // 164 // The default parameter values are suitable for query regions ranging 165 // from about 100 meters to 3000 km across. 166 // 167 // DEFAULT: 4 (average cell width == 600km) 168 using S2RegionCoverer::Options::min_level; 169 using S2RegionCoverer::Options::set_min_level; 170 171 // DEFAULT: 16 (average cell width == 150m) 172 using S2RegionCoverer::Options::max_level; 173 using S2RegionCoverer::Options::set_max_level; 174 175 // Setting level_mod() to a value greater than 1 increases the effective 176 // branching factor of the S2Cell hierarchy by skipping some levels. For 177 // example, if level_mod() == 2 then every second level is skipped (which 178 // increases the effective branching factor to 16). You might want to 179 // consider doing this if your query regions are typically very small 180 // (e.g., single points) and you don't mind increasing the index size 181 // (since skipping levels will reduce the accuracy of cell coverings for a 182 // given max_cells() limit). 183 // 184 // DEFAULT: 1 (don't skip any cell levels) 185 using S2RegionCoverer::Options::level_mod; 186 using S2RegionCoverer::Options::set_level_mod; 187 188 // If your index will only contain points (rather than regions), be sure 189 // to set this flag. This will generate smaller and faster queries that 190 // are specialized for the points-only case. 191 // 192 // With the default quality settings, this flag reduces the number of 193 // query terms by about a factor of two. (The improvement gets smaller 194 // as max_cells() is increased, but there is really no reason not to use 195 // this flag if your index consists entirely of points.) 196 // 197 // DEFAULT: false index_contains_points_only()198 bool index_contains_points_only() const { return points_only_; } set_index_contains_points_only(bool value)199 void set_index_contains_points_only(bool value) { points_only_ = value; } 200 201 // If true, the index will be optimized for space rather than for query 202 // time. With the default quality settings, this flag reduces the number 203 // of index terms and increases the number of query terms by the same 204 // factor (approximately 1.3). The factor increases up to a limiting 205 // ratio of 2.0 as max_cells() is increased. 206 // 207 // CAVEAT: This option has no effect if the index contains only points. 208 // 209 // DEFAULT: false optimize_for_space()210 bool optimize_for_space() const { return optimize_for_space_; } set_optimize_for_space(bool value)211 void set_optimize_for_space(bool value) { optimize_for_space_ = value; } 212 213 // A non-alphanumeric character that is used internally to distinguish 214 // between two different types of terms (by adding this character). 215 // 216 // REQUIRES: "ch" is non-alphanumeric. 217 // DEFAULT: '$' marker()218 const string& marker() const { return marker_; } marker_character()219 char marker_character() const { return marker_[0]; } 220 void set_marker_character(char ch); 221 222 private: 223 bool points_only_ = false; 224 bool optimize_for_space_ = false; 225 string marker_ = string(1, '$'); 226 }; 227 228 // Default constructor. Options can be set using mutable_options(). 229 S2RegionTermIndexer(); 230 ~S2RegionTermIndexer(); 231 232 // Constructs an S2RegionTermIndexer with the given options. 233 explicit S2RegionTermIndexer(const Options& options); 234 235 // S2RegionTermIndexer is movable but not copyable. 236 S2RegionTermIndexer(const S2RegionTermIndexer&) = delete; 237 S2RegionTermIndexer& operator=(const S2RegionTermIndexer&) = delete; 238 S2RegionTermIndexer(S2RegionTermIndexer&&); 239 S2RegionTermIndexer& operator=(S2RegionTermIndexer&&); 240 241 // Returns the current options. Options can be modifed between calls. options()242 const Options& options() const { return options_; } mutable_options()243 Options* mutable_options() { return &options_; } 244 245 // Converts the given region into a set of terms for indexing. Terms 246 // consist of lowercase letters, numbers, '$', and an optional prefix. 247 // 248 // "prefix" is a unique prefix used to distinguish S2 terms from other terms 249 // in the repository. The prefix may also be used to index documents with 250 // multiple types of location information (e.g. store footprint, entrances, 251 // parking lots, etc). The prefix should be kept short since it is 252 // prepended to every term. 253 std::vector<string> GetIndexTerms(const S2Region& region, 254 absl::string_view prefix); 255 256 // Converts a given query region into a set of terms. If you compute the 257 // union of all the documents associated with these terms, the result will 258 // include all documents whose index region intersects the query region. 259 // 260 // "prefix" should match the corresponding value used when indexing. 261 std::vector<string> GetQueryTerms(const S2Region& region, 262 absl::string_view prefix); 263 264 // Convenience methods that accept an S2Point rather than S2Region. (These 265 // methods are also faster.) 266 // 267 // Note that you can index an S2LatLng by converting it to an S2Point first: 268 // auto terms = GetIndexTerms(S2Point(latlng), ...); 269 std::vector<string> GetIndexTerms(const S2Point& point, 270 absl::string_view prefix); 271 std::vector<string> GetQueryTerms(const S2Point& point, 272 absl::string_view prefix); 273 274 // Low-level methods that accept an S2CellUnion covering of the region to be 275 // indexed or queried. 276 // 277 // REQUIRES: "covering" satisfies the S2RegionCoverer::Options for this 278 // class (i.e., max_cells, min_level, max_level, and level_mod). 279 // 280 // If you have a covering that was computed using different options, then 281 // you can either call the regular S2Region methods (since S2CellUnion is a 282 // type of S2Region), or "canonicalize" the covering first by calling 283 // S2RegionCoverer::CanonicalizeCovering() with the same options. 284 std::vector<string> GetIndexTermsForCanonicalCovering( 285 const S2CellUnion& covering, absl::string_view prefix); 286 std::vector<string> GetQueryTermsForCanonicalCovering( 287 const S2CellUnion& covering, absl::string_view prefix); 288 289 private: 290 enum TermType { ANCESTOR, COVERING }; 291 292 string GetTerm(TermType term_type, const S2CellId& id, 293 absl::string_view prefix) const; 294 295 Options options_; 296 S2RegionCoverer coverer_; 297 }; 298 299 #endif // S2_S2REGION_TERM_INDEXER_H_ 300