From d65ce55453b196f0a86a9636a98222a3d68e4572 Mon Sep 17 00:00:00 2001
From: Matthew Woodcraft <matthew@woodcraft.me.uk>
Date: Sun, 22 May 2022 13:37:19 +0100
Subject: [PATCH 1/2] When creating the search index, omit words longer than 80
 characters

This avoids creating deeply nested objects in searchindex.json
---
 src/renderer/html_handlebars/search.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/src/renderer/html_handlebars/search.rs b/src/renderer/html_handlebars/search.rs
index 0a59ffe9..b39569d4 100644
--- a/src/renderer/html_handlebars/search.rs
+++ b/src/renderer/html_handlebars/search.rs
@@ -13,6 +13,8 @@ use crate::utils;
 
 use serde::Serialize;
 
+const MAX_WORD_LENGTH_TO_INDEX: usize = 80;
+
 /// Creates all files required for search.
 pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
     let mut index = Index::new(&["title", "body", "breadcrumbs"]);
@@ -44,6 +46,15 @@ pub fn create_files(search_config: &Search, destination: &Path, book: &Book) ->
     Ok(())
 }
 
+/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
+fn tokenize(text: &str) -> Vec<String> {
+    text.split(|c: char| c.is_whitespace() || c == '-')
+        .filter(|s| !s.is_empty())
+        .map(|s| s.trim().to_lowercase())
+        .filter(|s| s.chars().count() <= MAX_WORD_LENGTH_TO_INDEX) // chars, not UTF-8 bytes
+        .collect()
+}
+
 /// Uses the given arguments to construct a search document, then inserts it to the given index.
 fn add_doc(
     index: &mut Index,
@@ -62,7 +73,7 @@ fn add_doc(
     doc_urls.push(url.into());
 
     let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
-    index.add_doc(&doc_ref, items);
+    index.add_doc_with_tokenizer(&doc_ref, items, tokenize);
 }
 
 /// Renders markdown into flat unformatted text and adds it to the search index.

From 00a55b35a8c58e1b270074469cb6382ad88f063c Mon Sep 17 00:00:00 2001
From: Matthew Woodcraft <matthew@woodcraft.me.uk>
Date: Sun, 22 May 2022 13:57:09 +0100
Subject: [PATCH 2/2] Test that long words are omitted from the search index.

Note they do appear in the 'docs' part of searchindex.json (so they will be
visible in search teasers).
---
 tests/dummy_book/src/first/no-headers.md | 4 +++-
 tests/rendered_output.rs                 | 2 +-
 tests/searchindex_fixture.json           | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/dummy_book/src/first/no-headers.md b/tests/dummy_book/src/first/no-headers.md
index 8f9a6d17..5d799aa6 100644
--- a/tests/dummy_book/src/first/no-headers.md
+++ b/tests/dummy_book/src/first/no-headers.md
@@ -1,3 +1,5 @@
 Capybara capybara capybara.
 
-Capybara capybara capybara.
\ No newline at end of file
+Capybara capybara capybara.
+
+ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.
diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs
index 873a622d..c6267830 100644
--- a/tests/rendered_output.rs
+++ b/tests/rendered_output.rs
@@ -772,7 +772,7 @@ mod search {
         );
         assert_eq!(
             docs[&no_headers]["body"],
-            "Capybara capybara capybara. Capybara capybara capybara."
+            "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
         );
     }
 
diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json
index 9c349b6b..3d7062d2 100644
--- a/tests/searchindex_fixture.json
+++ b/tests/searchindex_fixture.json
@@ -229,7 +229,7 @@
           "title": "Unicode stress tests"
         },
         "18": {
-          "body": "Capybara capybara capybara. Capybara capybara capybara.",
+          "body": "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.",
           "breadcrumbs": "First Chapter » No Headers",
           "id": "18",
           "title": "First Chapter"