Merge pull request #1809 from mattheww/2022-05_searchindex
Omit words longer than 80 characters from the search index
This commit is contained in:
commit
93aee6419e
|
@ -13,6 +13,8 @@ use crate::utils;
|
||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
|
||||||
|
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;
|
||||||
|
|
||||||
/// Creates all files required for search.
|
/// Creates all files required for search.
|
||||||
pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
|
pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
|
||||||
let mut index = Index::new(&["title", "body", "breadcrumbs"]);
|
let mut index = Index::new(&["title", "body", "breadcrumbs"]);
|
||||||
|
@ -44,6 +46,15 @@ pub fn create_files(search_config: &Search, destination: &Path, book: &Book) ->
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
|
||||||
|
fn tokenize(text: &str) -> Vec<String> {
|
||||||
|
text.split(|c: char| c.is_whitespace() || c == '-')
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.map(|s| s.trim().to_lowercase())
|
||||||
|
.filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Uses the given arguments to construct a search document, then inserts it to the given index.
|
/// Uses the given arguments to construct a search document, then inserts it to the given index.
|
||||||
fn add_doc(
|
fn add_doc(
|
||||||
index: &mut Index,
|
index: &mut Index,
|
||||||
|
@ -62,7 +73,7 @@ fn add_doc(
|
||||||
doc_urls.push(url.into());
|
doc_urls.push(url.into());
|
||||||
|
|
||||||
let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
|
let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
|
||||||
index.add_doc(&doc_ref, items);
|
index.add_doc_with_tokenizer(&doc_ref, items, tokenize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Renders markdown into flat unformatted text and adds it to the search index.
|
/// Renders markdown into flat unformatted text and adds it to the search index.
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
Capybara capybara capybara.
|
Capybara capybara capybara.
|
||||||
|
|
||||||
Capybara capybara capybara.
|
Capybara capybara capybara.
|
||||||
|
|
||||||
|
ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.
|
||||||
|
|
|
@ -772,7 +772,7 @@ mod search {
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
docs[&no_headers]["body"],
|
docs[&no_headers]["body"],
|
||||||
"Capybara capybara capybara. Capybara capybara capybara."
|
"Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -229,7 +229,7 @@
|
||||||
"title": "Unicode stress tests"
|
"title": "Unicode stress tests"
|
||||||
},
|
},
|
||||||
"18": {
|
"18": {
|
||||||
"body": "Capybara capybara capybara. Capybara capybara capybara.",
|
"body": "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.",
|
||||||
"breadcrumbs": "First Chapter » No Headers",
|
"breadcrumbs": "First Chapter » No Headers",
|
||||||
"id": "18",
|
"id": "18",
|
||||||
"title": "First Chapter"
|
"title": "First Chapter"
|
||||||
|
|
Loading…
Reference in New Issue