From 42e635bb9eed5c58ff271170015e9296ce8eceac Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Mon, 5 Feb 2024 14:11:27 -0800 Subject: [PATCH] Update pulldown_cmark to 0.10 --- Cargo.lock | 13 +- Cargo.toml | 2 +- src/book/summary.rs | 55 ++++-- src/renderer/html_handlebars/search.rs | 54 ++++-- src/utils/mod.rs | 35 +++- tests/dummy_book/src/conclusion.md | 4 + tests/rendered_output.rs | 11 +- tests/searchindex_fixture.json | 246 ++++++++++++++++++++++++- 8 files changed, 366 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66a3a493..906fa56c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1337,15 +1337,22 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.9.3" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +checksum = "dce76ce678ffc8e5675b22aa1405de0b7037e2fdf8913fea40d1926c6fe1e6e7" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "memchr", + "pulldown-cmark-escape", "unicase", ] +[[package]] +name = "pulldown-cmark-escape" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d8f9aa0e3cbcfaf8bf00300004ee3b72f74770f9cbac93f6928771f613276b" + [[package]] name = "quote" version = "1.0.33" diff --git a/Cargo.toml b/Cargo.toml index 9a6e7769..8e370ace 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ handlebars = "5.0" log = "0.4.17" memchr = "2.5.0" opener = "0.6.1" -pulldown-cmark = { version = "0.9.3", default-features = false } +pulldown-cmark = { version = "0.10.0", default-features = false, features = ["html"] } regex = "1.8.1" serde = { version = "1.0.163", features = ["derive"] } serde_json = "1.0.96" diff --git a/src/book/summary.rs b/src/book/summary.rs index b2784ce5..ed8f8337 100644 --- a/src/book/summary.rs +++ b/src/book/summary.rs @@ -1,7 +1,7 @@ use crate::errors::*; use log::{debug, trace, warn}; -use memchr::{self, Memchr}; -use pulldown_cmark::{self, Event, HeadingLevel, Tag}; +use memchr::Memchr; +use pulldown_cmark::{DefaultBrokenLinkCallback, Event, HeadingLevel, Tag, TagEnd}; use serde::{Deserialize, Serialize}; use std::fmt::{self, Display, Formatter}; use std::iter::FromIterator; @@ -163,7 +163,7 @@ impl From for SummaryItem { /// > match the following regex: "[^<>\n[]]+". struct SummaryParser<'a> { src: &'a str, - stream: pulldown_cmark::OffsetIter<'a, 'a>, + stream: pulldown_cmark::OffsetIter<'a, DefaultBrokenLinkCallback>, offset: usize, /// We can't actually put an event back into the `OffsetIter` stream, so instead we store it @@ -210,7 +210,7 @@ macro_rules! collect_events { } impl<'a> SummaryParser<'a> { - fn new(text: &str) -> SummaryParser<'_> { + fn new(text: &'a str) -> SummaryParser<'a> { let pulldown_parser = pulldown_cmark::Parser::new(text).into_offset_iter(); SummaryParser { @@ -265,7 +265,12 @@ impl<'a> SummaryParser<'a> { loop { match self.next_event() { Some(ev @ Event::Start(Tag::List(..))) - | Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => { + | Some( + ev @ Event::Start(Tag::Heading { + level: HeadingLevel::H1, + .. + }), + ) => { if is_prefix { // we've finished prefix chapters and are at the start // of the numbered section. @@ -275,8 +280,8 @@ impl<'a> SummaryParser<'a> { bail!(self.parse_error("Suffix chapters cannot be followed by a list")); } } - Some(Event::Start(Tag::Link(_type, href, _title))) => { - let link = self.parse_link(href.to_string()); + Some(Event::Start(Tag::Link { dest_url, .. })) => { + let link = self.parse_link(dest_url.to_string()); items.push(SummaryItem::Link(link)); } Some(Event::Rule) => items.push(SummaryItem::Separator), @@ -304,10 +309,13 @@ impl<'a> SummaryParser<'a> { break; } - Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => { + Some(Event::Start(Tag::Heading { + level: HeadingLevel::H1, + .. + })) => { debug!("Found a h1 in the SUMMARY"); - let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..)); + let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1)); Some(stringify_events(tags)) } @@ -336,7 +344,7 @@ impl<'a> SummaryParser<'a> { /// Finishes parsing a link once the `Event::Start(Tag::Link(..))` has been opened. fn parse_link(&mut self, href: String) -> Link { let href = href.replace("%20", " "); - let link_content = collect_events!(self.stream, end Tag::Link(..)); + let link_content = collect_events!(self.stream, end TagEnd::Link); let name = stringify_events(link_content); let path = if href.is_empty() { @@ -377,7 +385,12 @@ impl<'a> SummaryParser<'a> { } // The expectation is that pulldown cmark will terminate a paragraph before a new // heading, so we can always count on this to return without skipping headings. - Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => { + Some( + ev @ Event::Start(Tag::Heading { + level: HeadingLevel::H1, + .. + }), + ) => { // we're starting a new part self.back(ev); break; @@ -398,7 +411,7 @@ impl<'a> SummaryParser<'a> { // Skip over the contents of this tag while let Some(event) = self.next_event() { - if event == Event::End(other_tag.clone()) { + if event == Event::End(other_tag.clone().into()) { break; } } @@ -469,7 +482,7 @@ impl<'a> SummaryParser<'a> { last_item.nested_items = sub_items; } - Some(Event::End(Tag::List(..))) => break, + Some(Event::End(TagEnd::List(..))) => break, Some(_) => {} None => break, } @@ -486,8 +499,8 @@ impl<'a> SummaryParser<'a> { loop { match self.next_event() { Some(Event::Start(Tag::Paragraph)) => continue, - Some(Event::Start(Tag::Link(_type, href, _title))) => { - let mut link = self.parse_link(href.to_string()); + Some(Event::Start(Tag::Link { dest_url, .. })) => { + let mut link = self.parse_link(dest_url.to_string()); let mut number = parent.clone(); number.0.push(num_existing_items as u32 + 1); @@ -529,14 +542,18 @@ impl<'a> SummaryParser<'a> { fn parse_title(&mut self) -> Option { loop { match self.next_event() { - Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => { + Some(Event::Start(Tag::Heading { + level: HeadingLevel::H1, + .. + })) => { debug!("Found a h1 in the SUMMARY"); - let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..)); + let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1)); return Some(stringify_events(tags)); } // Skip a HTML element such as a comment line. - Some(Event::Html(_)) => {} + Some(Event::Html(_) | Event::InlineHtml(_)) + | Some(Event::Start(Tag::HtmlBlock) | Event::End(TagEnd::HtmlBlock)) => {} // Otherwise, no title. Some(ev) => { self.back(ev); @@ -744,7 +761,7 @@ mod tests { let _ = parser.stream.next(); // Discard opening paragraph let href = match parser.stream.next() { - Some((Event::Start(Tag::Link(_type, href, _title)), _range)) => href.to_string(), + Some((Event::Start(Tag::Link { dest_url, .. }), _range)) => dest_url.to_string(), other => panic!("Unreachable, {:?}", other), }; diff --git a/src/renderer/html_handlebars/search.rs b/src/renderer/html_handlebars/search.rs index 24d62fda..1144c9f5 100644 --- a/src/renderer/html_handlebars/search.rs +++ b/src/renderer/html_handlebars/search.rs @@ -66,10 +66,23 @@ fn add_doc( index: &mut Index, doc_urls: &mut Vec, anchor_base: &str, - section_id: &Option, + heading: &str, + id_counter: &mut HashMap, + section_id: &Option>, items: &[&str], ) { - let url = if let Some(ref id) = *section_id { + // Either use the explicit section id the user specified, or generate one + // from the heading content. + let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| { + if heading.is_empty() { + // In the case where a chapter has no heading, don't set a section id. + None + } else { + Some(utils::unique_id_from_content(heading, id_counter)) + } + }); + + let url = if let Some(id) = section_id { Cow::Owned(format!("{}#{}", anchor_base, id)) } else { Cow::Borrowed(anchor_base) @@ -119,7 +132,7 @@ fn render_item( let mut id_counter = HashMap::new(); while let Some(event) = p.next() { match event { - Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => { + Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => { if !heading.is_empty() { // Section finished, the next heading is following now // Write the data to the index, and clear it for the next section @@ -127,22 +140,21 @@ fn render_item( index, doc_urls, &anchor_base, + &heading, + &mut id_counter, §ion_id, &[&heading, &body, &breadcrumbs.join(" » ")], ); - section_id = None; heading.clear(); body.clear(); breadcrumbs.pop(); } + section_id = id; in_heading = true; } - Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => { + Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => { in_heading = false; - section_id = id - .map(|id| id.to_string()) - .or_else(|| Some(utils::unique_id_from_content(&heading, &mut id_counter))); breadcrumbs.push(heading.clone()); } Event::Start(Tag::FootnoteDefinition(name)) => { @@ -159,9 +171,19 @@ fn render_item( html_block.push_str(html); p.next(); } - body.push_str(&clean_html(&html_block)); } + Event::InlineHtml(html) => { + // This is not capable of cleaning inline tags like + // `foo `. The `. + +But regular inline is indexed. diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs index 7626b9e8..a01ce5f4 100644 --- a/tests/rendered_output.rs +++ b/tests/rendered_output.rs @@ -375,10 +375,7 @@ fn able_to_include_playground_files_in_chapters() { let second = temp.path().join("book/second.html"); - let playground_strings = &[ - r#"class="playground""#, - r#"println!("Hello World!");"#, - ]; + let playground_strings = &[r#"class="playground""#, r#"println!("Hello World!");"#]; assert_contains_strings(&second, playground_strings); assert_doesnt_contain_strings(&second, &["{{#playground example.rs}}"]); @@ -745,6 +742,7 @@ mod search { let index = read_book_index(temp.path()); let doc_urls = index["doc_urls"].as_array().unwrap(); + eprintln!("doc_urls={doc_urls:#?}",); let get_doc_ref = |url: &str| -> String { doc_urls.iter().position(|s| s == url).unwrap().to_string() }; @@ -774,7 +772,10 @@ mod search { docs[&summary]["breadcrumbs"], "First Chapter » Includes » Summary" ); - assert_eq!(docs[&conclusion]["body"], "I put <HTML> in here!"); + // See note about InlineHtml in search.rs. Ideally the `alert()` part + // should not be in the index, but we don't have a way to scrub inline + // html. + assert_eq!(docs[&conclusion]["body"], "I put <HTML> in here! Sneaky inline event alert(\"inline\");. But regular inline is indexed."); assert_eq!( docs[&no_headers]["breadcrumbs"], "First Chapter » No Headers" diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json index 85463028..06bfb2ab 100644 --- a/tests/searchindex_fixture.json +++ b/tests/searchindex_fixture.json @@ -145,7 +145,7 @@ "title": 1 }, "29": { - "body": 3, + "body": 10, "breadcrumbs": 2, "title": 1 }, @@ -319,7 +319,7 @@ "title": "Some section" }, "29": { - "body": "I put <HTML> in here!", + "body": "I put <HTML> in here! Sneaky inline event alert(\"inline\");. But regular inline is indexed.", "breadcrumbs": "Conclusion » Conclusion", "id": "29", "title": "Conclusion" @@ -412,6 +412,54 @@ }, "df": 0, "docs": {}, + "l": { + "df": 0, + "docs": {}, + "e": { + "df": 0, + "docs": {}, + "r": { + "df": 0, + "docs": {}, + "t": { + "(": { + "\"": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 0, + "docs": {}, + "l": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + } + } + } + } + }, + "df": 0, + "docs": {} + }, + "df": 0, + "docs": {} + } + } + } + }, "n": { "c": { "df": 0, @@ -1212,6 +1260,14 @@ "26": { "tf": 1.0 } + }, + "t": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } } } } @@ -1684,10 +1740,13 @@ "df": 0, "docs": {}, "x": { - "df": 1, + "df": 2, "docs": { "0": { "tf": 1.0 + }, + "29": { + "tf": 1.0 } } } @@ -1695,6 +1754,22 @@ }, "df": 0, "docs": {}, + "l": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 1, + "docs": { + "29": { + "tf": 1.4142135623730951 + } + } + } + } + }, "s": { "df": 0, "docs": {}, @@ -2359,6 +2434,30 @@ }, "df": 0, "docs": {}, + "g": { + "df": 0, + "docs": {}, + "u": { + "df": 0, + "docs": {}, + "l": { + "a": { + "df": 0, + "docs": {}, + "r": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + }, + "df": 0, + "docs": {} + } + } + }, "l": { "df": 1, "docs": { @@ -2590,6 +2689,26 @@ "n": { "df": 0, "docs": {}, + "e": { + "a": { + "df": 0, + "docs": {}, + "k": { + "df": 0, + "docs": {}, + "i": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + } + }, + "df": 0, + "docs": {} + }, "i": { "df": 0, "docs": {}, @@ -3252,6 +3371,54 @@ }, "df": 0, "docs": {}, + "l": { + "df": 0, + "docs": {}, + "e": { + "df": 0, + "docs": {}, + "r": { + "df": 0, + "docs": {}, + "t": { + "(": { + "\"": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 0, + "docs": {}, + "l": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + } + } + } + } + }, + "df": 0, + "docs": {} + }, + "df": 0, + "docs": {} + } + } + } + }, "n": { "c": { "df": 0, @@ -4130,6 +4297,14 @@ "26": { "tf": 1.0 } + }, + "t": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } } } } @@ -4665,10 +4840,13 @@ "df": 0, "docs": {}, "x": { - "df": 1, + "df": 2, "docs": { "0": { "tf": 1.0 + }, + "29": { + "tf": 1.0 } } } @@ -4676,6 +4854,22 @@ }, "df": 0, "docs": {}, + "l": { + "df": 0, + "docs": {}, + "i": { + "df": 0, + "docs": {}, + "n": { + "df": 1, + "docs": { + "29": { + "tf": 1.4142135623730951 + } + } + } + } + }, "s": { "df": 0, "docs": {}, @@ -5373,6 +5567,30 @@ }, "df": 0, "docs": {}, + "g": { + "df": 0, + "docs": {}, + "u": { + "df": 0, + "docs": {}, + "l": { + "a": { + "df": 0, + "docs": {}, + "r": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + }, + "df": 0, + "docs": {} + } + } + }, "l": { "df": 1, "docs": { @@ -5610,6 +5828,26 @@ "n": { "df": 0, "docs": {}, + "e": { + "a": { + "df": 0, + "docs": {}, + "k": { + "df": 0, + "docs": {}, + "i": { + "df": 1, + "docs": { + "29": { + "tf": 1.0 + } + } + } + } + }, + "df": 0, + "docs": {} + }, "i": { "df": 0, "docs": {},