Merge pull request #707 from mattico/search-index-opt

Optimize search index
This commit is contained in:
Matt Ickstadt 2018-07-23 12:32:05 -05:00 committed by GitHub
commit da9be67516
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 310 additions and 256 deletions

View File

@ -113,6 +113,7 @@ Available configuration options for the `[output.html.playpen]` table:
Available configuration options for the `[output.html.search]` table: Available configuration options for the `[output.html.search]` table:
- **enable:** Enables the search feature. Defaults to `true`.
- **limit-results:** The maximum number of search results. Defaults to `30`. - **limit-results:** The maximum number of search results. Defaults to `30`.
- **teaser-word-count:** The number of words used for a search result teaser. - **teaser-word-count:** The number of words used for a search result teaser.
Defaults to `30`. Defaults to `30`.
@ -168,6 +169,7 @@ boost-hierarchy = 1
boost-paragraph = 1 boost-paragraph = 1
expand = true expand = true
heading-split-level = 3 heading-split-level = 3
copy-js = true
``` ```

View File

@ -463,9 +463,11 @@ impl Default for Playpen {
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(default, rename_all = "kebab-case")] #[serde(default, rename_all = "kebab-case")]
pub struct Search { pub struct Search {
/// Enable the search feature. Default: `true`.
pub enable: bool,
/// Maximum number of visible results. Default: `30`. /// Maximum number of visible results. Default: `30`.
pub limit_results: u32, pub limit_results: u32,
/// The number of words used for a search result teaser. Default: `30`, /// The number of words used for a search result teaser. Default: `30`.
pub teaser_word_count: u32, pub teaser_word_count: u32,
/// Define the logical link between multiple search words. /// Define the logical link between multiple search words.
/// If true, all search words must appear in each result. Default: `false`. /// If true, all search words must appear in each result. Default: `false`.
@ -494,6 +496,7 @@ impl Default for Search {
fn default() -> Search { fn default() -> Search {
// Please update the documentation of `Search` when changing values! // Please update the documentation of `Search` when changing values!
Search { Search {
enable: true,
limit_results: 30, limit_results: 30,
teaser_word_count: 30, teaser_word_count: 30,
use_boolean_and: false, use_boolean_and: false,

View File

@ -367,8 +367,10 @@ impl Renderer for HtmlHandlebars {
.chain_err(|| "Unable to copy across additional CSS and JS")?; .chain_err(|| "Unable to copy across additional CSS and JS")?;
// Render search index // Render search index
#[cfg(feature = "search")] let search = html_config.search.unwrap_or_default();
super::search::create_files(&html_config.search.unwrap_or_default(), &destination, &book)?; if cfg!(feature = "search") && search.enable {
super::search::create_files(&search, &destination, &book)?;
}
// Copy all remaining files // Copy all remaining files
utils::fs::copy_files_except_ext(&src_dir, &destination, true, &["md"])?; utils::fs::copy_files_except_ext(&src_dir, &destination, true, &["md"])?;
@ -446,10 +448,9 @@ fn make_data(
let search = html_config.search.clone(); let search = html_config.search.clone();
if cfg!(feature = "search") { if cfg!(feature = "search") {
data.insert("search_enabled".to_owned(), json!(true)); let search = search.unwrap_or_default();
if search.unwrap_or_default().copy_js { data.insert("search_enabled".to_owned(), json!(search.enable));
data.insert("search_js".to_owned(), json!(true)); data.insert("search_js".to_owned(), json!(search.enable && search.copy_js));
}
} else if search.is_some() { } else if search.is_some() {
warn!("mdBook compiled without search support, ignoring `output.html.search` table"); warn!("mdBook compiled without search support, ignoring `output.html.search` table");
warn!( warn!(

View File

@ -18,16 +18,21 @@ use theme::searcher;
/// Creates all files required for search. /// Creates all files required for search.
pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> { pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
let mut index = Index::new(&["title", "body", "breadcrumbs"]); let mut index = Index::new(&["title", "body", "breadcrumbs"]);
let mut doc_urls = Vec::with_capacity(book.sections.len());
for item in book.iter() { for item in book.iter() {
render_item(&mut index, &search_config, item)?; render_item(&mut index, &search_config, &mut doc_urls, item)?;
} }
let index = write_to_js(index, &search_config)?; let index = write_to_json(index, &search_config, doc_urls)?;
debug!("Writing search index ✓"); debug!("Writing search index ✓");
if index.len() > 10_000_000 {
warn!("searchindex.json is very large ({} bytes)", index.len());
}
if search_config.copy_js { if search_config.copy_js {
utils::fs::write_file(destination, "searchindex.js", index.as_bytes())?; utils::fs::write_file(destination, "searchindex.json", index.as_bytes())?;
utils::fs::write_file(destination, "searchindex.js", format!("window.search = {};", index).as_bytes())?;
utils::fs::write_file(destination, "searcher.js", searcher::JS)?; utils::fs::write_file(destination, "searcher.js", searcher::JS)?;
utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?; utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?;
utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?; utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?;
@ -38,18 +43,22 @@ pub fn create_files(search_config: &Search, destination: &Path, book: &Book) ->
} }
/// Uses the given arguments to construct a search document, then inserts it into the given index. /// Uses the given arguments to construct a search document, then inserts it into the given index.
fn add_doc<'a>( fn add_doc(
index: &mut Index, index: &mut Index,
anchor_base: &'a str, doc_urls: &mut Vec<String>,
anchor_base: &str,
section_id: &Option<String>, section_id: &Option<String>,
items: &[&str], items: &[&str],
) { ) {
let doc_ref: Cow<'a, str> = if let &Some(ref id) = section_id { let url = if let &Some(ref id) = section_id {
format!("{}#{}", anchor_base, id).into() Cow::Owned(format!("{}#{}", anchor_base, id))
} else { } else {
anchor_base.into() Cow::Borrowed(anchor_base)
}; };
let doc_ref = utils::collapse_whitespace(doc_ref.trim()); let url = utils::collapse_whitespace(url.trim());
let doc_ref = doc_urls.len().to_string();
doc_urls.push(url.into());
let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim())); let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
index.add_doc(&doc_ref, items); index.add_doc(&doc_ref, items);
} }
@ -58,6 +67,7 @@ fn add_doc<'a>(
fn render_item( fn render_item(
index: &mut Index, index: &mut Index,
search_config: &Search, search_config: &Search,
doc_urls: &mut Vec<String>,
item: &BookItem, item: &BookItem,
) -> Result<()> { ) -> Result<()> {
let chapter = match item { let chapter = match item {
@ -92,6 +102,7 @@ fn render_item(
// Write the data to the index, and clear it for the next section // Write the data to the index, and clear it for the next section
add_doc( add_doc(
index, index,
doc_urls,
&anchor_base, &anchor_base,
&section_id, &section_id,
&[&heading, &body, &breadcrumbs.join(" » ")], &[&heading, &body, &breadcrumbs.join(" » ")],
@ -144,6 +155,7 @@ fn render_item(
// Make sure the last section is added to the index // Make sure the last section is added to the index
add_doc( add_doc(
index, index,
doc_urls,
&anchor_base, &anchor_base,
&section_id, &section_id,
&[&heading, &body, &breadcrumbs.join(" » ")], &[&heading, &body, &breadcrumbs.join(" » ")],
@ -153,10 +165,7 @@ fn render_item(
Ok(()) Ok(())
} }
/// Exports the index and search options to a JS script which stores the index in `window.search`. fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> {
/// Using a JS script is a workaround for CORS in `file://` URIs. It also removes the need for
/// downloading/parsing JSON in JS.
fn write_to_js(index: Index, search_config: &Search) -> Result<String> {
use std::collections::BTreeMap; use std::collections::BTreeMap;
use self::elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField}; use self::elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField};
@ -169,9 +178,11 @@ fn write_to_js(index: Index, search_config: &Search) -> Result<String> {
#[derive(Serialize)] #[derive(Serialize)]
struct SearchindexJson { struct SearchindexJson {
/// The options used for displaying search results /// The options used for displaying search results
resultsoptions: ResultsOptions, results_options: ResultsOptions,
/// The searchoptions for elasticlunr.js /// The searchoptions for elasticlunr.js
searchoptions: SearchOptions, search_options: SearchOptions,
/// Used to look up a document's URL from an integer document ref.
doc_urls: Vec<String>,
/// The index for elasticlunr.js /// The index for elasticlunr.js
index: elasticlunr::Index, index: elasticlunr::Index,
} }
@ -185,7 +196,7 @@ fn write_to_js(index: Index, search_config: &Search) -> Result<String> {
opt.boost = Some(search_config.boost_hierarchy); opt.boost = Some(search_config.boost_hierarchy);
fields.insert("breadcrumbs".into(), opt); fields.insert("breadcrumbs".into(), opt);
let searchoptions = SearchOptions { let search_options = SearchOptions {
bool: if search_config.use_boolean_and { bool: if search_config.use_boolean_and {
SearchBool::And SearchBool::And
} else { } else {
@ -195,14 +206,15 @@ fn write_to_js(index: Index, search_config: &Search) -> Result<String> {
fields, fields,
}; };
let resultsoptions = ResultsOptions { let results_options = ResultsOptions {
limit_results: search_config.limit_results, limit_results: search_config.limit_results,
teaser_word_count: search_config.teaser_word_count, teaser_word_count: search_config.teaser_word_count,
}; };
let json_contents = SearchindexJson { let json_contents = SearchindexJson {
resultsoptions, results_options,
searchoptions, search_options,
doc_urls,
index, index,
}; };
@ -211,7 +223,7 @@ fn write_to_js(index: Index, search_config: &Search) -> Result<String> {
let json_contents = serde_json::to_value(&json_contents)?; let json_contents = serde_json::to_value(&json_contents)?;
let json_contents = serde_json::to_string(&json_contents)?; let json_contents = serde_json::to_string(&json_contents)?;
Ok(format!("window.search = {};", json_contents)) Ok(json_contents)
} }
fn clean_html(html: &str) -> String { fn clean_html(html: &str) -> String {

View File

@ -27,11 +27,12 @@ window.search = window.search || {};
content = document.getElementById('content'), content = document.getElementById('content'),
searchindex = null, searchindex = null,
resultsoptions = { doc_urls = [],
results_options = {
teaser_word_count: 30, teaser_word_count: 30,
limit_results: 30, limit_results: 30,
}, },
searchoptions = { search_options = {
bool: "AND", bool: "AND",
expand: true, expand: true,
fields: { fields: {
@ -139,7 +140,7 @@ window.search = window.search || {};
teaser_count++; teaser_count++;
// The ?URL_MARK_PARAM= parameter belongs in between the page and the #heading-anchor // The ?URL_MARK_PARAM= parameter belongs in between the page and the #heading-anchor
var url = result.ref.split("#"); var url = doc_urls[result.ref].split("#");
if (url.length == 1) { // no anchor found if (url.length == 1) { // no anchor found
url.push(""); url.push("");
} }
@ -196,7 +197,7 @@ window.search = window.search || {};
} }
var window_weight = []; var window_weight = [];
var window_size = Math.min(weighted.length, resultsoptions.teaser_word_count); var window_size = Math.min(weighted.length, results_options.teaser_word_count);
var cur_sum = 0; var cur_sum = 0;
for (var wordindex = 0; wordindex < window_size; wordindex++) { for (var wordindex = 0; wordindex < window_size; wordindex++) {
@ -246,11 +247,12 @@ window.search = window.search || {};
return teaser_split.join(''); return teaser_split.join('');
} }
function init() { function init(config) {
resultsoptions = window.search.resultsoptions; results_options = config.results_options;
searchoptions = window.search.searchoptions; search_options = config.search_options;
searchbar_outer = window.search.searchbar_outer; searchbar_outer = config.searchbar_outer;
searchindex = elasticlunr.Index.load(window.search.index); doc_urls = config.doc_urls;
searchindex = elasticlunr.Index.load(config.index);
// Set up events // Set up events
searchicon.addEventListener('click', function(e) { searchIconClickHandler(); }, false); searchicon.addEventListener('click', function(e) { searchIconClickHandler(); }, false);
@ -441,8 +443,8 @@ window.search = window.search || {};
if (searchindex == null) { return; } if (searchindex == null) { return; }
// Do the actual search // Do the actual search
var results = searchindex.search(searchterm, searchoptions); var results = searchindex.search(searchterm, search_options);
var resultcount = Math.min(results.length, resultsoptions.limit_results); var resultcount = Math.min(results.length, results_options.limit_results);
// Display search metrics // Display search metrics
searchresults_header.innerText = formatSearchMetric(resultcount, searchterm); searchresults_header.innerText = formatSearchMetric(resultcount, searchterm);
@ -460,7 +462,16 @@ window.search = window.search || {};
showResults(true); showResults(true);
} }
init(); fetch(path_to_root + 'searchindex.json')
.then(response => response.json())
.then(json => init(json))
.catch(error => { // Try to load searchindex.js if fetch failed
var script = document.createElement('script');
script.src = path_to_root + 'searchindex.js';
script.onload = () => init(window.search);
document.head.appendChild(script);
});
// Exported functions // Exported functions
search.hasFocus = hasFocus; search.hasFocus = hasFocus;
})(window.search); })(window.search);

View File

@ -426,25 +426,39 @@ mod search {
let index = read_book_index(temp.path()); let index = read_book_index(temp.path());
let doc_urls = index["doc_urls"].as_array().unwrap();
let get_doc_ref = |url: &str| -> String {
doc_urls.iter()
.position(|s| s == url)
.unwrap()
.to_string()
};
let first_chapter = get_doc_ref("first/index.html#first-chapter");
let introduction = get_doc_ref("intro.html#introduction");
let some_section = get_doc_ref("first/index.html#some-section");
let summary = get_doc_ref("first/includes.html#summary");
let conclusion = get_doc_ref("conclusion.html#conclusion");
let bodyidx = &index["index"]["index"]["body"]["root"]; let bodyidx = &index["index"]["index"]["body"]["root"];
let textidx = &bodyidx["t"]["e"]["x"]["t"]; let textidx = &bodyidx["t"]["e"]["x"]["t"];
assert_eq!(textidx["df"], 2); assert_eq!(textidx["df"], 2);
assert_eq!(textidx["docs"]["first/index.html#first-chapter"]["tf"], 1.0); assert_eq!(textidx["docs"][&first_chapter]["tf"], 1.0);
assert_eq!(textidx["docs"]["intro.html#introduction"]["tf"], 1.0); assert_eq!(textidx["docs"][&introduction]["tf"], 1.0);
let docs = &index["index"]["documentStore"]["docs"]; let docs = &index["index"]["documentStore"]["docs"];
assert_eq!(docs["first/index.html#first-chapter"]["body"], "more text."); assert_eq!(docs[&first_chapter]["body"], "more text.");
assert_eq!(docs["first/index.html#some-section"]["body"], ""); assert_eq!(docs[&some_section]["body"], "");
assert_eq!( assert_eq!(
docs["first/includes.html#summary"]["body"], docs[&summary]["body"],
"Introduction First Chapter Nested Chapter Includes Recursive Second Chapter Conclusion" "Introduction First Chapter Nested Chapter Includes Recursive Second Chapter Conclusion"
); );
assert_eq!( assert_eq!(
docs["first/includes.html#summary"]["breadcrumbs"], docs[&summary]["breadcrumbs"],
"First Chapter » Summary" "First Chapter » Summary"
); );
assert_eq!( assert_eq!(
docs["conclusion.html#conclusion"]["body"], docs[&conclusion]["body"],
"I put &lt;HTML&gt; in here!" "I put &lt;HTML&gt; in here!"
); );
} }
@ -452,7 +466,7 @@ mod search {
// Setting this to `true` may cause issues with `cargo watch`, // Setting this to `true` may cause issues with `cargo watch`,
// since it may not finish writing the fixture before the tests // since it may not finish writing the fixture before the tests
// are run again. // are run again.
const GENERATE_FIXTURE: bool = true; const GENERATE_FIXTURE: bool = false;
fn get_fixture() -> serde_json::Value { fn get_fixture() -> serde_json::Value {
if GENERATE_FIXTURE { if GENERATE_FIXTURE {
@ -481,7 +495,7 @@ mod search {
// //
// If you're pretty sure you haven't broken anything, change `GENERATE_FIXTURE` // If you're pretty sure you haven't broken anything, change `GENERATE_FIXTURE`
// above to `true`, and run `cargo test` to generate a new fixture. Then // above to `true`, and run `cargo test` to generate a new fixture. Then
// change it back to `false`. Include the changed `searchindex_fixture.json` in your commit. // **change it back to `false`**. Include the changed `searchindex_fixture.json` in your commit.
#[test] #[test]
fn search_index_hasnt_changed_accidentally() { fn search_index_hasnt_changed_accidentally() {
let temp = DummyBook::new().build().unwrap(); let temp = DummyBook::new().build().unwrap();

File diff suppressed because it is too large Load Diff