//! # elasticlunr-rs
//!
//! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
//! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
//! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
//!
//! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
//! be used for generating compatible search indices.
//!
//! Access to all index-generating functionality is provided. Most users will only need to use the
//! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
//!
//! ## Example
//!
//! ```
//! use std::fs::File;
//! use std::io::Write;
//! use elasticlunr::Index;
//!
//! let mut index = Index::new(&["title", "body"]);
//! index.add_doc("1", &["This is a title", "This is body text!"]);
//! // Add more docs...
//! let mut file = File::create("out.json").unwrap();
//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
//! ```

#![cfg_attr(feature = "bench", feature(test))]

#[macro_use]
extern crate lazy_static;
extern crate regex;
extern crate serde;
#[macro_use]
extern crate serde_derive;
extern crate serde_json;
extern crate strum;
#[macro_use]
extern crate strum_macros;

#[cfg(feature = "rust-stemmers")]
extern crate rust_stemmers;

#[cfg(test)]
#[macro_use]
extern crate maplit;
#[cfg(feature = "zh")]
extern crate jieba_rs;
#[cfg(feature = "ja")]
extern crate lindera;

/// The version of elasticlunr.js this library was designed for.
52 pub const ELASTICLUNR_VERSION: &str = "0.9.5"; 53 54 pub mod config; 55 pub mod document_store; 56 pub mod inverted_index; 57 pub mod lang; 58 pub mod pipeline; 59 60 use std::collections::{BTreeMap, BTreeSet}; 61 62 use document_store::DocumentStore; 63 use inverted_index::InvertedIndex; 64 pub use lang::Language; 65 pub use pipeline::Pipeline; 66 use pipeline::TokenizerFn; 67 68 /// A builder for an `Index` with custom parameters. 69 /// 70 /// # Example 71 /// ``` 72 /// # use elasticlunr::{Index, IndexBuilder}; 73 /// let mut index = IndexBuilder::new() 74 /// .save_docs(false) 75 /// .add_fields(&["title", "subtitle", "body"]) 76 /// .set_ref("doc_id") 77 /// .build(); 78 /// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]); 79 /// ``` 80 pub struct IndexBuilder { 81 save: bool, 82 fields: BTreeSet<String>, 83 ref_field: String, 84 pipeline: Option<Pipeline>, 85 } 86 87 impl Default for IndexBuilder { default() -> Self88 fn default() -> Self { 89 IndexBuilder { 90 save: true, 91 fields: BTreeSet::new(), 92 ref_field: "id".into(), 93 pipeline: None, 94 } 95 } 96 } 97 98 impl IndexBuilder { new() -> Self99 pub fn new() -> Self { 100 Default::default() 101 } 102 103 /// Set whether or not documents should be saved in the `Index`'s document store. save_docs(mut self, save: bool) -> Self104 pub fn save_docs(mut self, save: bool) -> Self { 105 self.save = save; 106 self 107 } 108 109 /// Add a document field to the `Index`. 110 /// 111 /// If the `Index` already contains a field with an identical name, adding it again is a no-op. add_field(mut self, field: &str) -> Self112 pub fn add_field(mut self, field: &str) -> Self { 113 self.fields.insert(field.into()); 114 self 115 } 116 117 /// Add the document fields to the `Index`. 118 /// 119 /// If the `Index` already contains a field with an identical name, adding it again is a no-op. 
add_fields<I>(mut self, fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,120 pub fn add_fields<I>(mut self, fields: I) -> Self 121 where 122 I: IntoIterator, 123 I::Item: AsRef<str>, 124 { 125 self.fields 126 .extend(fields.into_iter().map(|f| f.as_ref().into())); 127 self 128 } 129 130 /// Set the key used to store the document reference field. set_ref(mut self, ref_field: &str) -> Self131 pub fn set_ref(mut self, ref_field: &str) -> Self { 132 self.ref_field = ref_field.into(); 133 self 134 } 135 136 /// Set the pipeline used by the `Index`. set_pipeline(mut self, pipeline: Pipeline) -> Self137 pub fn set_pipeline(mut self, pipeline: Pipeline) -> Self { 138 self.pipeline = Some(pipeline); 139 self 140 } 141 142 /// Build an `Index` from this builder. build(self) -> Index143 pub fn build(self) -> Index { 144 let index = self 145 .fields 146 .iter() 147 .map(|f| (f.clone(), InvertedIndex::new())) 148 .collect(); 149 150 Index { 151 index, 152 fields: self.fields.into_iter().collect(), 153 ref_field: self.ref_field, 154 document_store: DocumentStore::new(self.save), 155 pipeline: self.pipeline.unwrap_or_default(), 156 version: ::ELASTICLUNR_VERSION, 157 lang: Language::English, 158 } 159 } 160 } 161 162 /// An elasticlunr search index. 163 #[derive(Serialize, Deserialize, Debug)] 164 #[serde(rename_all = "camelCase")] 165 pub struct Index { 166 // TODO(3.0): Use a BTreeSet<String> 167 pub fields: Vec<String>, 168 pub pipeline: Pipeline, 169 #[serde(rename = "ref")] 170 pub ref_field: String, 171 pub version: &'static str, 172 index: BTreeMap<String, InvertedIndex>, 173 pub document_store: DocumentStore, 174 lang: Language, 175 } 176 177 impl Index { 178 /// Create a new index with the provided fields. 
179 /// 180 /// # Example 181 /// 182 /// ``` 183 /// # use elasticlunr::Index; 184 /// let mut index = Index::new(&["title", "body", "breadcrumbs"]); 185 /// index.add_doc("1", &["How to Foo", "First, you need to `bar`.", "Chapter 1 > How to Foo"]); 186 /// ``` 187 /// 188 /// # Panics 189 /// 190 /// Panics if multiple given fields are identical. new<I>(fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,191 pub fn new<I>(fields: I) -> Self 192 where 193 I: IntoIterator, 194 I::Item: AsRef<str>, 195 { 196 Index::with_language(Language::English, fields) 197 } 198 199 /// Create a new index with the provided fields for the given 200 /// [`Language`](lang/enum.Language.html). 201 /// 202 /// # Example 203 /// 204 /// ``` 205 /// # use elasticlunr::{Index, Language}; 206 /// let mut index = Index::with_language(Language::English, &["title", "body"]); 207 /// index.add_doc("1", &["this is a title", "this is body text"]); 208 /// ``` 209 /// 210 /// # Panics 211 /// 212 /// Panics if multiple given fields are identical. with_language<I>(lang: Language, fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,213 pub fn with_language<I>(lang: Language, fields: I) -> Self 214 where 215 I: IntoIterator, 216 I::Item: AsRef<str>, 217 { 218 let mut indices = BTreeMap::new(); 219 let mut field_vec = Vec::new(); 220 for field in fields { 221 let field = field.as_ref().to_string(); 222 if field_vec.contains(&field) { 223 panic!("The Index already contains the field {}", field); 224 } 225 field_vec.push(field.clone()); 226 indices.insert(field, InvertedIndex::new()); 227 } 228 229 Index { 230 fields: field_vec, 231 index: indices, 232 pipeline: lang.make_pipeline(), 233 ref_field: "id".into(), 234 version: ::ELASTICLUNR_VERSION, 235 document_store: DocumentStore::new(true), 236 lang: lang, 237 } 238 } 239 240 /// Add the data from a document to the index. 
241 /// 242 /// *NOTE: The elements of `data` should be provided in the same order as 243 /// the fields used to create the index.* 244 /// 245 /// # Example 246 /// ``` 247 /// # use elasticlunr::Index; 248 /// let mut index = Index::new(&["title", "body"]); 249 /// index.add_doc("1", &["this is a title", "this is body text"]); 250 /// ``` add_doc<I>(&mut self, doc_ref: &str, data: I) where I: IntoIterator, I::Item: AsRef<str>,251 pub fn add_doc<I>(&mut self, doc_ref: &str, data: I) 252 where 253 I: IntoIterator, 254 I::Item: AsRef<str>, 255 { 256 let tokenizer = match self.lang { 257 #[cfg(feature = "zh")] 258 Language::Chinese => pipeline::tokenize_chinese, 259 #[cfg(feature = "ja")] 260 Language::Japanese => pipeline::tokenize_japanese, 261 _ => pipeline::tokenize, 262 }; 263 self.add_doc_with_tokenizer(doc_ref, data, tokenizer) 264 } 265 266 /// Add the data from a document to the index. 267 /// 268 /// *NOTE: The elements of `data` should be provided in the same order as 269 /// the fields used to create the index.* 270 /// 271 /// # Example 272 /// ``` 273 /// # use elasticlunr::Index; 274 /// fn css_tokenizer(text: &str) -> Vec<String> { 275 /// text.split(|c: char| c.is_whitespace()) 276 /// .filter(|s| !s.is_empty()) 277 /// .map(|s| s.trim().to_lowercase()) 278 /// .collect() 279 /// } 280 /// let mut index = Index::new(&["title", "body"]); 281 /// index.add_doc_with_tokenizer("1", &["this is a title", "this is body text"], css_tokenizer); 282 /// ``` add_doc_with_tokenizer<I>(&mut self, doc_ref: &str, data: I, tokenizer: TokenizerFn) where I: IntoIterator, I::Item: AsRef<str>,283 pub fn add_doc_with_tokenizer<I>(&mut self, doc_ref: &str, data: I, tokenizer: TokenizerFn) 284 where 285 I: IntoIterator, 286 I::Item: AsRef<str>, 287 { 288 self.add_doc_with_tokenizers(doc_ref, data, std::iter::repeat(tokenizer)); 289 } 290 291 /// Add the data from a document to the index. 
292 /// 293 /// *NOTE: The elements of `data` and `tokenizers` should be provided in 294 /// the same order as the fields used to create the index.* 295 /// 296 /// # Example 297 /// ``` 298 /// # use elasticlunr::Index; 299 /// use elasticlunr::pipeline::{tokenize, TokenizerFn}; 300 /// fn css_tokenizer(text: &str) -> Vec<String> { 301 /// text.split(|c: char| c.is_whitespace()) 302 /// .filter(|s| !s.is_empty()) 303 /// .map(|s| s.trim().to_lowercase()) 304 /// .collect() 305 /// } 306 /// let mut index = Index::new(&["title", "body"]); 307 /// let tokenizers: Vec<TokenizerFn> = vec![tokenize, css_tokenizer]; 308 /// index.add_doc_with_tokenizers("1", &["this is a title", "this is body text"], tokenizers); 309 /// ``` add_doc_with_tokenizers<I, T>(&mut self, doc_ref: &str, data: I, tokenizers: T) where I: IntoIterator, I::Item: AsRef<str>, T: IntoIterator<Item=TokenizerFn>,310 pub fn add_doc_with_tokenizers<I, T>(&mut self, doc_ref: &str, data: I, tokenizers: T) 311 where 312 I: IntoIterator, 313 I::Item: AsRef<str>, 314 T: IntoIterator<Item=TokenizerFn>, 315 { 316 let mut doc = BTreeMap::new(); 317 doc.insert(self.ref_field.clone(), doc_ref.into()); 318 let mut token_freq = BTreeMap::new(); 319 320 for ((field, value), tokenizer) in self.fields.iter().zip(data).zip(tokenizers) { 321 doc.insert(field.clone(), value.as_ref().to_string()); 322 323 if field == &self.ref_field { 324 continue; 325 } 326 327 let raw_tokens = tokenizer(value.as_ref()); 328 329 let tokens = self.pipeline.run(raw_tokens); 330 331 self.document_store 332 .add_field_length(doc_ref, field, tokens.len()); 333 334 for token in tokens { 335 *token_freq.entry(token).or_insert(0u64) += 1; 336 } 337 338 for (token, count) in &token_freq { 339 let freq = (*count as f64).sqrt(); 340 341 self.index 342 .get_mut(field) 343 .expect(&format!("InvertedIndex does not exist for field {}", field)) 344 .add_token(doc_ref, token, freq); 345 } 346 } 347 348 self.document_store.add_doc(doc_ref, doc); 349 } 350 
get_fields(&self) -> &[String]351 pub fn get_fields(&self) -> &[String] { 352 &self.fields 353 } 354 355 /// Returns the index, serialized to pretty-printed JSON. to_json_pretty(&self) -> String356 pub fn to_json_pretty(&self) -> String { 357 serde_json::to_string_pretty(&self).unwrap() 358 } 359 360 /// Returns the index, serialized to JSON. to_json(&self) -> String361 pub fn to_json(&self) -> String { 362 serde_json::to_string(&self).unwrap() 363 } 364 } 365 366 #[cfg(test)] 367 mod tests { 368 use super::*; 369 370 #[test] add_field_to_builder()371 fn add_field_to_builder() { 372 let idx = IndexBuilder::new() 373 .add_field("foo") 374 .add_fields(&["foo", "bar", "baz"]) 375 .build(); 376 377 let idx_fields = idx.get_fields(); 378 for f in &["foo", "bar", "baz"] { 379 assert_eq!(idx_fields.iter().filter(|x| x == f).count(), 1); 380 } 381 } 382 383 #[test] adding_document_to_index()384 fn adding_document_to_index() { 385 let mut idx = Index::new(&["body"]); 386 idx.add_doc("1", &["this is a test"]); 387 388 assert_eq!(idx.document_store.len(), 1); 389 assert_eq!( 390 idx.document_store.get_doc("1").unwrap(), 391 btreemap! { 392 "id".into() => "1".into(), 393 "body".into() => "this is a test".into(), 394 } 395 ); 396 } 397 398 #[test] adding_document_with_empty_field()399 fn adding_document_with_empty_field() { 400 let mut idx = Index::new(&["title", "body"]); 401 402 idx.add_doc("1", &["", "test"]); 403 assert_eq!(idx.index["body"].get_doc_frequency("test"), 1); 404 assert_eq!(idx.index["body"].get_docs("test").unwrap()["1"], 1.); 405 } 406 407 #[test] 408 #[should_panic] creating_index_with_identical_fields_panics()409 fn creating_index_with_identical_fields_panics() { 410 let _idx = Index::new(&["title", "body", "title"]); 411 } 412 } 413