1 //!# elasticlunr-rs
2 //!
3 //! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
4 //! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
5 //! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
6 //!
7 //! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
8 //! be used for generating compatible search indices.
9 //!
10 //! Access to all index-generating functionality is provided. Most users will only need to use the
11 //! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
12 //!
13 //! ## Example
14 //!
15 //! ```
16 //! use std::fs::File;
17 //! use std::io::Write;
18 //! use elasticlunr::Index;
19 //!
20 //! let mut index = Index::new(&["title", "body"]);
21 //! index.add_doc("1", &["This is a title", "This is body text!"]);
22 //! // Add more docs...
23 //! let mut file = File::create("out.json").unwrap();
//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
25 //! ```
26 
27 #![cfg_attr(feature = "bench", feature(test))]
28 
29 #[macro_use]
30 extern crate lazy_static;
31 extern crate regex;
32 extern crate serde;
33 #[macro_use]
34 extern crate serde_derive;
35 extern crate serde_json;
36 extern crate strum;
37 #[macro_use]
38 extern crate strum_macros;
39 
40 #[cfg(feature = "rust-stemmers")]
41 extern crate rust_stemmers;
42 
43 #[cfg(test)]
44 #[macro_use]
45 extern crate maplit;
46 #[cfg(feature = "zh")]
47 extern crate jieba_rs;
48 #[cfg(feature = "ja")]
49 extern crate lindera;
50 
51 /// The version of elasticlunr.js this library was designed for.
52 pub const ELASTICLUNR_VERSION: &str = "0.9.5";
53 
54 pub mod config;
55 pub mod document_store;
56 pub mod inverted_index;
57 pub mod lang;
58 pub mod pipeline;
59 
60 use std::collections::{BTreeMap, BTreeSet};
61 
62 use document_store::DocumentStore;
63 use inverted_index::InvertedIndex;
64 pub use lang::Language;
65 pub use pipeline::Pipeline;
66 use pipeline::TokenizerFn;
67 
68 /// A builder for an `Index` with custom parameters.
69 ///
70 /// # Example
71 /// ```
72 /// # use elasticlunr::{Index, IndexBuilder};
73 /// let mut index = IndexBuilder::new()
74 ///     .save_docs(false)
75 ///     .add_fields(&["title", "subtitle", "body"])
76 ///     .set_ref("doc_id")
77 ///     .build();
78 /// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
79 /// ```
80 pub struct IndexBuilder {
81     save: bool,
82     fields: BTreeSet<String>,
83     ref_field: String,
84     pipeline: Option<Pipeline>,
85 }
86 
87 impl Default for IndexBuilder {
default() -> Self88     fn default() -> Self {
89         IndexBuilder {
90             save: true,
91             fields: BTreeSet::new(),
92             ref_field: "id".into(),
93             pipeline: None,
94         }
95     }
96 }
97 
98 impl IndexBuilder {
new() -> Self99     pub fn new() -> Self {
100         Default::default()
101     }
102 
103     /// Set whether or not documents should be saved in the `Index`'s document store.
save_docs(mut self, save: bool) -> Self104     pub fn save_docs(mut self, save: bool) -> Self {
105         self.save = save;
106         self
107     }
108 
109     /// Add a document field to the `Index`.
110     ///
111     /// If the `Index` already contains a field with an identical name, adding it again is a no-op.
add_field(mut self, field: &str) -> Self112     pub fn add_field(mut self, field: &str) -> Self {
113         self.fields.insert(field.into());
114         self
115     }
116 
117     /// Add the document fields to the `Index`.
118     ///
119     /// If the `Index` already contains a field with an identical name, adding it again is a no-op.
add_fields<I>(mut self, fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,120     pub fn add_fields<I>(mut self, fields: I) -> Self
121     where
122         I: IntoIterator,
123         I::Item: AsRef<str>,
124     {
125         self.fields
126             .extend(fields.into_iter().map(|f| f.as_ref().into()));
127         self
128     }
129 
130     /// Set the key used to store the document reference field.
set_ref(mut self, ref_field: &str) -> Self131     pub fn set_ref(mut self, ref_field: &str) -> Self {
132         self.ref_field = ref_field.into();
133         self
134     }
135 
136     /// Set the pipeline used by the `Index`.
set_pipeline(mut self, pipeline: Pipeline) -> Self137     pub fn set_pipeline(mut self, pipeline: Pipeline) -> Self {
138         self.pipeline = Some(pipeline);
139         self
140     }
141 
142     /// Build an `Index` from this builder.
build(self) -> Index143     pub fn build(self) -> Index {
144         let index = self
145             .fields
146             .iter()
147             .map(|f| (f.clone(), InvertedIndex::new()))
148             .collect();
149 
150         Index {
151             index,
152             fields: self.fields.into_iter().collect(),
153             ref_field: self.ref_field,
154             document_store: DocumentStore::new(self.save),
155             pipeline: self.pipeline.unwrap_or_default(),
156             version: ::ELASTICLUNR_VERSION,
157             lang: Language::English,
158         }
159     }
160 }
161 
162 /// An elasticlunr search index.
163 #[derive(Serialize, Deserialize, Debug)]
164 #[serde(rename_all = "camelCase")]
165 pub struct Index {
166     // TODO(3.0): Use a BTreeSet<String>
167     pub fields: Vec<String>,
168     pub pipeline: Pipeline,
169     #[serde(rename = "ref")]
170     pub ref_field: String,
171     pub version: &'static str,
172     index: BTreeMap<String, InvertedIndex>,
173     pub document_store: DocumentStore,
174     lang: Language,
175 }
176 
177 impl Index {
178     /// Create a new index with the provided fields.
179     ///
180     /// # Example
181     ///
182     /// ```
183     /// # use elasticlunr::Index;
184     /// let mut index = Index::new(&["title", "body", "breadcrumbs"]);
185     /// index.add_doc("1", &["How to Foo", "First, you need to `bar`.", "Chapter 1 > How to Foo"]);
186     /// ```
187     ///
188     /// # Panics
189     ///
190     /// Panics if multiple given fields are identical.
new<I>(fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,191     pub fn new<I>(fields: I) -> Self
192     where
193         I: IntoIterator,
194         I::Item: AsRef<str>,
195     {
196         Index::with_language(Language::English, fields)
197     }
198 
199     /// Create a new index with the provided fields for the given
200     /// [`Language`](lang/enum.Language.html).
201     ///
202     /// # Example
203     ///
204     /// ```
205     /// # use elasticlunr::{Index, Language};
206     /// let mut index = Index::with_language(Language::English, &["title", "body"]);
207     /// index.add_doc("1", &["this is a title", "this is body text"]);
208     /// ```
209     ///
210     /// # Panics
211     ///
212     /// Panics if multiple given fields are identical.
with_language<I>(lang: Language, fields: I) -> Self where I: IntoIterator, I::Item: AsRef<str>,213     pub fn with_language<I>(lang: Language, fields: I) -> Self
214     where
215         I: IntoIterator,
216         I::Item: AsRef<str>,
217     {
218         let mut indices = BTreeMap::new();
219         let mut field_vec = Vec::new();
220         for field in fields {
221             let field = field.as_ref().to_string();
222             if field_vec.contains(&field) {
223                 panic!("The Index already contains the field {}", field);
224             }
225             field_vec.push(field.clone());
226             indices.insert(field, InvertedIndex::new());
227         }
228 
229         Index {
230             fields: field_vec,
231             index: indices,
232             pipeline: lang.make_pipeline(),
233             ref_field: "id".into(),
234             version: ::ELASTICLUNR_VERSION,
235             document_store: DocumentStore::new(true),
236             lang: lang,
237         }
238     }
239 
240     /// Add the data from a document to the index.
241     ///
242     /// *NOTE: The elements of `data` should be provided in the same order as
243     /// the fields used to create the index.*
244     ///
245     /// # Example
246     /// ```
247     /// # use elasticlunr::Index;
248     /// let mut index = Index::new(&["title", "body"]);
249     /// index.add_doc("1", &["this is a title", "this is body text"]);
250     /// ```
add_doc<I>(&mut self, doc_ref: &str, data: I) where I: IntoIterator, I::Item: AsRef<str>,251     pub fn add_doc<I>(&mut self, doc_ref: &str, data: I)
252     where
253         I: IntoIterator,
254         I::Item: AsRef<str>,
255     {
256         let tokenizer = match self.lang {
257             #[cfg(feature = "zh")]
258             Language::Chinese => pipeline::tokenize_chinese,
259             #[cfg(feature = "ja")]
260             Language::Japanese => pipeline::tokenize_japanese,
261             _ => pipeline::tokenize,
262         };
263         self.add_doc_with_tokenizer(doc_ref, data, tokenizer)
264     }
265 
266     /// Add the data from a document to the index.
267     ///
268     /// *NOTE: The elements of `data` should be provided in the same order as
269     /// the fields used to create the index.*
270     ///
271     /// # Example
272     /// ```
273     /// # use elasticlunr::Index;
274     /// fn css_tokenizer(text: &str) -> Vec<String> {
275     ///     text.split(|c: char| c.is_whitespace())
276     ///         .filter(|s| !s.is_empty())
277     ///         .map(|s| s.trim().to_lowercase())
278     ///         .collect()
279     /// }
280     /// let mut index = Index::new(&["title", "body"]);
281     /// index.add_doc_with_tokenizer("1", &["this is a title", "this is body text"], css_tokenizer);
282     /// ```
add_doc_with_tokenizer<I>(&mut self, doc_ref: &str, data: I, tokenizer: TokenizerFn) where I: IntoIterator, I::Item: AsRef<str>,283     pub fn add_doc_with_tokenizer<I>(&mut self, doc_ref: &str, data: I, tokenizer: TokenizerFn)
284     where
285         I: IntoIterator,
286         I::Item: AsRef<str>,
287     {
288         self.add_doc_with_tokenizers(doc_ref, data, std::iter::repeat(tokenizer));
289     }
290 
291     /// Add the data from a document to the index.
292     ///
293     /// *NOTE: The elements of `data` and `tokenizers` should be provided in
294     /// the same order as the fields used to create the index.*
295     ///
296     /// # Example
297     /// ```
298     /// # use elasticlunr::Index;
299     /// use elasticlunr::pipeline::{tokenize, TokenizerFn};
300     /// fn css_tokenizer(text: &str) -> Vec<String> {
301     ///     text.split(|c: char| c.is_whitespace())
302     ///         .filter(|s| !s.is_empty())
303     ///         .map(|s| s.trim().to_lowercase())
304     ///         .collect()
305     /// }
306     /// let mut index = Index::new(&["title", "body"]);
307     /// let tokenizers: Vec<TokenizerFn> = vec![tokenize, css_tokenizer];
308     /// index.add_doc_with_tokenizers("1", &["this is a title", "this is body text"], tokenizers);
309     /// ```
add_doc_with_tokenizers<I, T>(&mut self, doc_ref: &str, data: I, tokenizers: T) where I: IntoIterator, I::Item: AsRef<str>, T: IntoIterator<Item=TokenizerFn>,310     pub fn add_doc_with_tokenizers<I, T>(&mut self, doc_ref: &str, data: I, tokenizers: T)
311     where
312         I: IntoIterator,
313         I::Item: AsRef<str>,
314         T: IntoIterator<Item=TokenizerFn>,
315     {
316         let mut doc = BTreeMap::new();
317         doc.insert(self.ref_field.clone(), doc_ref.into());
318         let mut token_freq = BTreeMap::new();
319 
320         for ((field, value), tokenizer) in self.fields.iter().zip(data).zip(tokenizers) {
321             doc.insert(field.clone(), value.as_ref().to_string());
322 
323             if field == &self.ref_field {
324                 continue;
325             }
326 
327             let raw_tokens = tokenizer(value.as_ref());
328 
329             let tokens = self.pipeline.run(raw_tokens);
330 
331             self.document_store
332                 .add_field_length(doc_ref, field, tokens.len());
333 
334             for token in tokens {
335                 *token_freq.entry(token).or_insert(0u64) += 1;
336             }
337 
338             for (token, count) in &token_freq {
339                 let freq = (*count as f64).sqrt();
340 
341                 self.index
342                     .get_mut(field)
343                     .expect(&format!("InvertedIndex does not exist for field {}", field))
344                     .add_token(doc_ref, token, freq);
345             }
346         }
347 
348         self.document_store.add_doc(doc_ref, doc);
349     }
350 
get_fields(&self) -> &[String]351     pub fn get_fields(&self) -> &[String] {
352         &self.fields
353     }
354 
355     /// Returns the index, serialized to pretty-printed JSON.
to_json_pretty(&self) -> String356     pub fn to_json_pretty(&self) -> String {
357         serde_json::to_string_pretty(&self).unwrap()
358     }
359 
360     /// Returns the index, serialized to JSON.
to_json(&self) -> String361     pub fn to_json(&self) -> String {
362         serde_json::to_string(&self).unwrap()
363     }
364 }
365 
#[cfg(test)]
mod tests {
    use super::*;

    /// A field registered more than once through the builder appears exactly once.
    #[test]
    fn add_field_to_builder() {
        let builder = IndexBuilder::new()
            .add_field("foo")
            .add_fields(&["foo", "bar", "baz"]);
        let idx = builder.build();

        let idx_fields = idx.get_fields();
        for name in ["foo", "bar", "baz"].iter() {
            let occurrences = idx_fields
                .iter()
                .filter(|field| field.as_str() == *name)
                .count();
            assert_eq!(occurrences, 1);
        }
    }

    /// Adding a document stores it, including its reference under `"id"`.
    #[test]
    fn adding_document_to_index() {
        let mut idx = Index::new(&["body"]);
        idx.add_doc("1", &["this is a test"]);

        let stored = idx.document_store.get_doc("1");

        assert_eq!(idx.document_store.len(), 1);
        assert_eq!(
            stored.unwrap(),
            btreemap! {
                "id".into() => "1".into(),
                "body".into() => "this is a test".into(),
            }
        );
    }

    /// An empty field does not prevent the other fields from being indexed.
    #[test]
    fn adding_document_with_empty_field() {
        let mut idx = Index::new(&["title", "body"]);
        idx.add_doc("1", &["", "test"]);

        let body_index = &idx.index["body"];
        assert_eq!(body_index.get_doc_frequency("test"), 1);
        assert_eq!(body_index.get_docs("test").unwrap()["1"], 1.0);
    }

    /// Passing the same field name twice to `Index::new` must panic.
    #[test]
    #[should_panic]
    fn creating_index_with_identical_fields_panics() {
        let duplicated = ["title", "body", "title"];
        let _idx = Index::new(&duplicated);
    }
}
413