Merge pull request #1809 from mattheww/2022-05_searchindex

Omit words longer than 80 characters from the search index
This commit is contained in:
Dylan DPC 2022-06-22 13:14:08 +02:00 committed by GitHub
commit 93aee6419e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 17 additions and 4 deletions

View File

@ -13,6 +13,8 @@ use crate::utils;
use serde::Serialize; use serde::Serialize;
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;
/// Creates all files required for search. /// Creates all files required for search.
pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> { pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
let mut index = Index::new(&["title", "body", "breadcrumbs"]); let mut index = Index::new(&["title", "body", "breadcrumbs"]);
@ -44,6 +46,15 @@ pub fn create_files(search_config: &Search, destination: &Path, book: &Book) ->
Ok(()) Ok(())
} }
/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
///
/// Splits the input on whitespace and hyphens, lowercases each token, and
/// discards empty tokens as well as tokens longer than
/// `MAX_WORD_LENGTH_TO_INDEX`. Note the length cutoff is measured in bytes
/// (`str::len`), not characters, matching the original implementation.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        // Consecutive separators produce empty slices; drop them.
        .filter(|s| !s.is_empty())
        // No `trim` needed: every whitespace char is a split point above,
        // so tokens can never carry leading or trailing whitespace.
        .map(str::to_lowercase)
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
/// Uses the given arguments to construct a search document, then inserts it to the given index. /// Uses the given arguments to construct a search document, then inserts it to the given index.
fn add_doc( fn add_doc(
index: &mut Index, index: &mut Index,
@ -62,7 +73,7 @@ fn add_doc(
doc_urls.push(url.into()); doc_urls.push(url.into());
let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim())); let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
index.add_doc(&doc_ref, items); index.add_doc_with_tokenizer(&doc_ref, items, tokenize);
} }
/// Renders markdown into flat unformatted text and adds it to the search index. /// Renders markdown into flat unformatted text and adds it to the search index.

View File

@ -1,3 +1,5 @@
Capybara capybara capybara. Capybara capybara capybara.
Capybara capybara capybara. Capybara capybara capybara.
ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.

View File

@ -772,7 +772,7 @@ mod search {
); );
assert_eq!( assert_eq!(
docs[&no_headers]["body"], docs[&no_headers]["body"],
"Capybara capybara capybara. Capybara capybara capybara." "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
); );
} }

View File

@ -229,7 +229,7 @@
"title": "Unicode stress tests" "title": "Unicode stress tests"
}, },
"18": { "18": {
"body": "Capybara capybara capybara. Capybara capybara capybara.", "body": "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.",
"breadcrumbs": "First Chapter » No Headers", "breadcrumbs": "First Chapter » No Headers",
"id": "18", "id": "18",
"title": "First Chapter" "title": "First Chapter"