Render markdown into a search index for elasticlunr (feature-gated because of a nightly requirement).

Phaiax 2017-10-03 00:09:10 +02:00
parent 893dc39b60
commit 18a1dc08c8
4 changed files with 149 additions and 11 deletions

View File

@@ -29,6 +29,7 @@ toml = "0.4"
open = "1.1"
regex = "0.2.1"
tempdir = "0.3.4"
elasticlunr = { git = "https://github.com/mattico/elasticlunr-rs", optional = true}
# Watch feature
notify = { version = "4.0", optional = true }
@@ -55,6 +56,7 @@ output = []
regenerate-css = []
watch = ["notify", "time", "crossbeam"]
serve = ["iron", "staticfile", "ws"]
searchindex = ["elasticlunr"]
[[bin]]
doc = false
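
Since elasticlunr is an optional dependency tied to the new "searchindex" feature, the index generation added by this commit would presumably be enabled at build time with "cargo build --features searchindex"; default builds leave the dependency out entirely.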

View File

@@ -88,6 +88,8 @@ extern crate serde_derive;
extern crate serde_json;
extern crate tempdir;
extern crate toml;
#[cfg(feature = "searchindex")]
extern crate elasticlunr;
mod parse;
mod preprocess;

View File

@@ -9,6 +9,9 @@ use theme::{Theme, playpen_editor};
use errors::*;
use regex::{Captures, Regex};
#[cfg(feature = "searchindex")]
use elasticlunr;
use std::ascii::AsciiExt;
use std::path::{Path, PathBuf};
use std::fs::{self, File};
@@ -31,7 +34,8 @@ impl HtmlHandlebars {
fn render_item(&self,
item: &BookItem,
mut ctx: RenderItemContext,
print_content: &mut String)
print_content: &mut String,
search_documents : &mut Vec<utils::SearchDocument>)
-> Result<()> {
// FIXME: This should be made DRY-er and rely less on mutable state
match *item {
@@ -42,6 +46,15 @@
let content = utils::fs::file_to_string(&path)?;
let base = path.parent()
.ok_or_else(|| String::from("Invalid bookitem path!"))?;
let path = ch.path.to_str().ok_or_else(|| {
io::Error::new(io::ErrorKind::Other, "Could not convert path to str")
})?;
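// Collect search documents from the raw markdown (before link expansion); breadcrumbs
// are left empty here, and `id_from_content` supplies the heading-to-anchor mapping.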
utils::render_markdown_into_searchindex(search_documents,
&content,
path,
&vec![],
id_from_content);
// Parse and expand links
let content = preprocess::links::replace_all(&content, base)?;
@@ -49,11 +62,6 @@
print_content.push_str(&content);
// Update the context with data for this file
let path = ch.path.to_str().ok_or_else(|| {
io::Error::new(io::ErrorKind::Other,
"Could not convert path \
to str")
})?;
// Non-lexical lifetimes needed :'(
let title: String;
@@ -264,6 +272,9 @@ impl Renderer for HtmlHandlebars {
// Print version
let mut print_content = String::new();
// Search index
let mut search_documents = vec![];
// TODO: The Renderer trait should really pass in where it wants us to build to...
let destination = book.get_destination();
@@ -280,9 +291,12 @@
is_index: i == 0,
html_config: html_config.clone(),
};
self.render_item(item, ctx, &mut print_content)?;
self.render_item(item, ctx, &mut print_content, &mut search_documents)?;
}
// Search index
make_searchindex(book, &search_documents)?;
// Print version
self.configure_print_version(&mut data, &print_content);
if let Some(ref title) = book.config.book.title {
@@ -300,7 +314,7 @@
book.write_file(Path::new("print").with_extension("html"),
&rendered.into_bytes())?;
info!("[*] Creating print.html ✓");
info!("[*] Creating \"print.html\"");
// Copy static files (js, css, images, ...)
debug!("[*] Copy static files");
@@ -619,6 +633,26 @@ pub fn normalize_id(content: &str) -> String {
.collect::<String>()
}
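// Without the "searchindex" feature, fall back to a no-op so the renderer can call
// `make_searchindex` unconditionally.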
#[cfg(not(feature = "searchindex"))]
fn make_searchindex(_book: &MDBook, _search_documents : &Vec<utils::SearchDocument>) -> Result<()> {
Ok(())
}
#[cfg(feature = "searchindex")]
fn make_searchindex(book: &MDBook, search_documents : &Vec<utils::SearchDocument>) -> Result<()> {
let mut index = elasticlunr::IndexBuilder::new();
for sd in search_documents {
index.add_document(&sd.title, &sd.body);
}
book.write_file(
Path::new("searchindex").with_extension("json"),
&index.to_json().as_bytes(),
)?;
info!("[*] Creating \"searchindex.json\"");
Ok(())
}
#[cfg(test)]
mod tests {

View File

@@ -3,7 +3,107 @@ pub mod fs;
use pulldown_cmark::{html, Event, Options, Parser, Tag, OPTION_ENABLE_FOOTNOTES,
OPTION_ENABLE_TABLES};
use std::borrow::Cow;
use std::fmt::Write;
use regex::Regex;
use std::rc::Rc;
/// A heading together with the content that follows it, up to the next heading, makes up
/// one `SearchDocument`. It represents an independently searchable part of the book.
#[derive(Default, Debug)]
pub struct SearchDocument {
// Corresponding heading
pub title : String,
// Content: flattened paragraphs, lists, code
pub body : String,
/// The information needed to generate a link to the corresponding title anchor.
/// The first part is the `reference_base`, which should be the same for all documents that
/// came from the same `.md` file. The second part is derived from the heading of the search
/// document.
pub sref : (Rc<String>, Option<String>),
// Breadcrumbs like ["Main Chapter Title", "Sub Chapter Title", "H1 Heading"]
// as a human-understandable path to the search document.
pub breadcrumbs : Vec<Rc<String>>,
}
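// Illustrative example (not part of this commit): for a chapter file "guide/installation.md"
// containing "## Installation", the resulting document would roughly have
// title "Installation", sref ("guide/installation.md", Some("installation")),
// breadcrumbs ending in "Installation", and the following paragraphs flattened into `body`.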
impl SearchDocument {
fn new(sref0 : &Rc<String>, bcs : &Vec<Rc<String>>) -> SearchDocument {
SearchDocument {
title : "".to_owned(),
body : "".to_owned(),
sref : (sref0.clone(), None),
breadcrumbs : bcs.clone()
}
}
fn has_content(&self) -> bool {
self.title.len() > 0
}
fn add(&mut self, text : &str, to_title : bool) {
if to_title {
self.title.write_str(&text).unwrap();
} else {
self.body.write_str(&text).unwrap();
self.body.write_str(&" ").unwrap();
}
}
}
/// Renders markdown into flat, unformatted text for use in the search index.
/// See the `SearchDocument` struct.
///
/// The field `sref` in the `SearchDocument` struct becomes
/// `(reference_base, Some(heading_to_sref("The Section Heading")))`
pub fn render_markdown_into_searchindex<F>(
search_documents: &mut Vec<SearchDocument>,
text: &str,
reference_base: &str,
breadcrumbs : &Vec<Rc<String>>,
heading_to_sref : F)
where F : Fn(&str) -> String {
let mut opts = Options::empty();
opts.insert(OPTION_ENABLE_TABLES);
opts.insert(OPTION_ENABLE_FOOTNOTES);
let p = Parser::new_ext(text, opts);
let reference_base = Rc::new(reference_base.to_owned());
let mut current = SearchDocument::new(&reference_base, breadcrumbs);
let mut in_header = false;
for event in p {
match event {
Event::Start(Tag::Header(i)) if i <= 3 => {
if current.has_content() {
search_documents.push(current);
}
current = SearchDocument::new(&reference_base, breadcrumbs);
in_header = true;
}
Event::End(Tag::Header(_)) => {
// Possible extension: Use h1,h2,h3 as hierarchy for the breadcrumbs
current.breadcrumbs.push(Rc::new(current.title.clone()));
current.sref.1 = Some(heading_to_sref(&current.title));
in_header = false;
}
Event::Start(_) | Event::End(_) => {}
Event::Text(text) => {
current.add(&text, in_header);
}
Event::Html(html) | Event::InlineHtml(html) => {
current.body.write_str(&trim_html_tags(&html)).unwrap();
}
Event::FootnoteReference(_) => {}
Event::SoftBreak | Event::HardBreak => {}
}
}
search_documents.push(current);
}
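// Illustrative usage sketch (not part of this commit), assuming a trivial `heading_to_sref`:
//
//     let mut docs: Vec<SearchDocument> = Vec::new();
//     render_markdown_into_searchindex(
//         &mut docs,
//         "# Intro\nSome text.\n\n## Details\nMore text.",
//         "chapter_1.md",
//         &vec![],
//         |heading| heading.to_lowercase(),
//     );
//     // `docs` now holds two documents: one titled "Intro" with body "Some text. " and
//     // one titled "Details" with body "More text. ", both with sref base "chapter_1.md".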
fn trim_html_tags<'a>(text : &'a str) -> Cow<'a, str> {
let regex = Regex::new(r"<[^>]*?>").unwrap();
regex.replace_all(text, "")
}
///
///