mdBook/src/utils/mod.rs

495 lines
16 KiB
Rust
Raw Normal View History

#![allow(missing_docs)] // FIXME: Document this
pub mod fs;
mod string;
pub(crate) mod toml_ext;
use crate::errors::Error;
use regex::Regex;
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag};
use std::borrow::Cow;
use std::collections::HashMap;
2019-07-01 23:52:25 +08:00
use std::fmt::Write;
use std::path::Path;
pub use self::string::{
take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines,
take_rustdoc_include_lines,
};
/// Replaces multiple consecutive whitespace characters with a single space character.
pub fn collapse_whitespace(text: &str) -> Cow<'_, str> {
lazy_static! {
static ref RE: Regex = Regex::new(r"\s\s+").unwrap();
}
RE.replace_all(text, " ")
}
/// Convert the given string to a valid HTML element ID.
/// The only restriction is that the ID must not contain any ASCII whitespace.
pub fn normalize_id(content: &str) -> String {
content
.chars()
.filter_map(|ch| {
if ch.is_alphanumeric() || ch == '_' || ch == '-' {
Some(ch.to_ascii_lowercase())
} else if ch.is_whitespace() {
Some('-')
} else {
None
}
2019-05-05 22:57:43 +08:00
})
.collect::<String>()
}
/// Generate an ID for use with anchors which is derived from a "normalised"
/// string.
// This function should be made private when the deprecation expires.
#[deprecated(since = "0.4.16", note = "use unique_id_from_content instead")]
pub fn id_from_content(content: &str) -> String {
let mut content = content.to_string();
// Skip any tags or html-encoded stuff
2021-11-10 05:41:44 +08:00
lazy_static! {
static ref HTML: Regex = Regex::new(r"(<.*?>)").unwrap();
}
content = HTML.replace_all(&content, "").into();
const REPL_SUB: &[&str] = &["&lt;", "&gt;", "&amp;", "&#39;", "&quot;"];
for sub in REPL_SUB {
content = content.replace(sub, "");
}
// Remove spaces and hashes indicating a header
let trimmed = content.trim().trim_start_matches('#').trim();
normalize_id(trimmed)
}
/// Generate an ID for use with anchors which is derived from a "normalised"
/// string.
///
/// Each ID returned will be unique, if the same `id_counter` is provided on
/// each call.
pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap<String, usize>) -> String {
let id = {
#[allow(deprecated)]
id_from_content(content)
};
// If we have headers with the same normalized id, append an incrementing counter
let id_count = id_counter.entry(id.clone()).or_insert(0);
let unique_id = match *id_count {
0 => id,
id_count => format!("{}-{}", id, id_count),
};
*id_count += 1;
unique_id
}
2019-07-01 23:52:25 +08:00
/// Fix links to the correct location.
///
/// This adjusts links, such as turning `.md` extensions to `.html`.
///
/// `path` is the path to the page being rendered relative to the root of the
/// book. This is used for the `print.html` page so that links on the print
/// page go to the original location. Normal page rendering sets `path` to
/// None. Ideally, print page links would link to anchors on the print page,
/// but that is very difficult.
fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
lazy_static! {
2019-05-09 05:50:59 +08:00
static ref SCHEME_LINK: Regex = Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap();
static ref MD_LINK: Regex = Regex::new(r"(?P<link>.*)\.md(?P<anchor>#.*)?").unwrap();
}
2019-07-01 23:52:25 +08:00
fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
if dest.starts_with('#') {
// Fragment-only link.
if let Some(path) = path {
let mut base = path.display().to_string();
if base.ends_with(".md") {
base.replace_range(base.len() - 3.., ".html");
}
return format!("{}{}", base, dest).into();
} else {
return dest;
}
}
2019-05-09 05:50:59 +08:00
// Don't modify links with schemes like `https`.
if !SCHEME_LINK.is_match(&dest) {
// This is a relative link, adjust it as necessary.
let mut fixed_link = String::new();
2019-07-01 23:52:25 +08:00
if let Some(path) = path {
let base = path
.parent()
.expect("path can't be empty")
.to_str()
.expect("utf-8 paths only");
if !base.is_empty() {
write!(fixed_link, "{}/", base).unwrap();
}
2019-05-09 05:50:59 +08:00
}
2019-05-09 05:50:59 +08:00
if let Some(caps) = MD_LINK.captures(&dest) {
fixed_link.push_str(&caps["link"]);
fixed_link.push_str(".html");
if let Some(anchor) = caps.name("anchor") {
fixed_link.push_str(anchor.as_str());
}
2019-05-09 05:50:59 +08:00
} else {
fixed_link.push_str(&dest);
};
return CowStr::from(fixed_link);
2019-05-09 05:50:59 +08:00
}
dest
}
2019-07-01 23:52:25 +08:00
fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
// This is a terrible hack, but should be reasonably reliable. Nobody
// should ever parse a tag with a regex. However, there isn't anything
// in Rust that I know of that is suitable for handling partial html
// fragments like those generated by pulldown_cmark.
//
// There are dozens of HTML tags/attributes that contain paths, so
// feel free to add more tags if desired; these are the only ones I
// care about right now.
lazy_static! {
static ref HTML_LINK: Regex =
Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap();
}
HTML_LINK
.replace_all(&html, |caps: &regex::Captures<'_>| {
let fixed = fix(caps[2].into(), path);
format!("{}{}\"", &caps[1], fixed)
})
.into_owned()
.into()
}
2019-05-09 05:50:59 +08:00
match event {
Event::Start(Tag::Link(link_type, dest, title)) => {
2019-07-01 23:52:25 +08:00
Event::Start(Tag::Link(link_type, fix(dest, path), title))
2019-05-09 05:50:59 +08:00
}
Event::Start(Tag::Image(link_type, dest, title)) => {
2019-07-01 23:52:25 +08:00
Event::Start(Tag::Image(link_type, fix(dest, path), title))
2018-07-24 01:45:01 +08:00
}
2019-07-01 23:52:25 +08:00
Event::Html(html) => Event::Html(fix_html(html, path)),
2018-07-24 01:45:01 +08:00
_ => event,
}
}
/// Wrapper around the pulldown-cmark parser for rendering markdown to HTML.
pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
2019-07-01 23:52:25 +08:00
render_markdown_with_path(text, curly_quotes, None)
2019-01-16 02:19:10 +08:00
}
2021-12-25 20:46:27 +08:00
pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> {
let mut opts = Options::empty();
opts.insert(Options::ENABLE_TABLES);
opts.insert(Options::ENABLE_FOOTNOTES);
opts.insert(Options::ENABLE_STRIKETHROUGH);
opts.insert(Options::ENABLE_TASKLISTS);
2021-11-20 09:26:30 +08:00
if curly_quotes {
opts.insert(Options::ENABLE_SMART_PUNCTUATION);
}
Parser::new_ext(text, opts)
}
2019-07-01 23:52:25 +08:00
pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String {
let mut s = String::with_capacity(text.len() * 3 / 2);
2021-11-20 09:26:30 +08:00
let p = new_cmark_parser(text, curly_quotes);
let events = p
.map(clean_codeblock_headers)
.map(|event| adjust_links(event, path))
.flat_map(|event| {
let (a, b) = wrap_tables(event);
a.into_iter().chain(b)
});
html::push_html(&mut s, events);
s
}
/// Wraps tables in a `.table-wrapper` class to apply overflow-x rules to.
fn wrap_tables(event: Event<'_>) -> (Option<Event<'_>>, Option<Event<'_>>) {
match event {
Event::Start(Tag::Table(_)) => (
Some(Event::Html(r#"<div class="table-wrapper">"#.into())),
Some(event),
),
Event::End(Tag::Table(_)) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
_ => (Some(event), None),
}
}
fn clean_codeblock_headers(event: Event<'_>) -> Event<'_> {
match event {
Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(ref info))) => {
let info: String = info
.chars()
.map(|x| match x {
' ' | '\t' => ',',
_ => x,
})
.filter(|ch| !ch.is_whitespace())
.collect();
Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(CowStr::from(info))))
}
_ => event,
}
}
/// Prints a "backtrace" of some `Error`.
pub fn log_backtrace(e: &Error) {
error!("Error: {}", e);
for cause in e.chain().skip(1) {
error!("\tCaused By: {}", cause);
}
}
pub(crate) fn bracket_escape(mut s: &str) -> String {
let mut escaped = String::with_capacity(s.len());
let needs_escape: &[char] = &['<', '>'];
while let Some(next) = s.find(needs_escape) {
escaped.push_str(&s[..next]);
match s.as_bytes()[next] {
b'<' => escaped.push_str("&lt;"),
b'>' => escaped.push_str("&gt;"),
_ => unreachable!(),
}
s = &s[next + 1..];
}
escaped.push_str(s);
escaped
}
#[cfg(test)]
mod tests {
use super::bracket_escape;
mod render_markdown {
use super::super::render_markdown;
#[test]
fn preserves_external_links() {
2018-07-24 01:45:01 +08:00
assert_eq!(
render_markdown("[example](https://www.rust-lang.org/)", false),
"<p><a href=\"https://www.rust-lang.org/\">example</a></p>\n"
);
}
#[test]
fn it_can_adjust_markdown_links() {
2018-07-24 01:45:01 +08:00
assert_eq!(
render_markdown("[example](example.md)", false),
"<p><a href=\"example.html\">example</a></p>\n"
);
assert_eq!(
render_markdown("[example_anchor](example.md#anchor)", false),
"<p><a href=\"example.html#anchor\">example_anchor</a></p>\n"
);
// this anchor contains 'md' inside of it
assert_eq!(
render_markdown("[phantom data](foo.html#phantomdata)", false),
"<p><a href=\"foo.html#phantomdata\">phantom data</a></p>\n"
);
}
2021-07-30 16:17:02 +08:00
#[test]
fn it_can_wrap_tables() {
let src = r#"
| Original | Punycode | Punycode + Encoding |
|-----------------|-----------------|---------------------|
| føø | f-5gaa | f_5gaa |
"#;
let out = r#"
<div class="table-wrapper"><table><thead><tr><th>Original</th><th>Punycode</th><th>Punycode + Encoding</th></tr></thead><tbody>
<tr><td>føø</td><td>f-5gaa</td><td>f_5gaa</td></tr>
</tbody></table>
</div>
"#.trim();
assert_eq!(render_markdown(src, false), out);
}
#[test]
fn it_can_keep_quotes_straight() {
assert_eq!(render_markdown("'one'", false), "<p>'one'</p>\n");
}
#[test]
fn it_can_make_quotes_curly_except_when_they_are_in_code() {
let input = r#"
'one'
```
'two'
```
`'three'` 'four'"#;
let expected = r#"<p>one</p>
<pre><code>'two'
</code></pre>
<p><code>'three'</code> four</p>
"#;
assert_eq!(render_markdown(input, true), expected);
}
#[test]
fn whitespace_outside_of_codeblock_header_is_preserved() {
let input = r#"
some text with spaces
```rust
fn main() {
// code inside is unchanged
}
```
more text with spaces
"#;
let expected = r#"<p>some text with spaces</p>
<pre><code class="language-rust">fn main() {
// code inside is unchanged
}
</code></pre>
<p>more text with spaces</p>
"#;
assert_eq!(render_markdown(input, false), expected);
assert_eq!(render_markdown(input, true), expected);
}
#[test]
fn rust_code_block_properties_are_passed_as_space_delimited_class() {
let input = r#"
```rust,no_run,should_panic,property_3
```
"#;
let expected = r#"<pre><code class="language-rust,no_run,should_panic,property_3"></code></pre>
"#;
assert_eq!(render_markdown(input, false), expected);
assert_eq!(render_markdown(input, true), expected);
}
#[test]
fn rust_code_block_properties_with_whitespace_are_passed_as_space_delimited_class() {
let input = r#"
```rust, no_run,,,should_panic , ,property_3
```
"#;
2021-02-21 14:28:16 +08:00
let expected = r#"<pre><code class="language-rust,,,,,no_run,,,should_panic,,,,property_3"></code></pre>
"#;
assert_eq!(render_markdown(input, false), expected);
assert_eq!(render_markdown(input, true), expected);
}
#[test]
fn rust_code_block_without_properties_has_proper_html_class() {
let input = r#"
```rust
```
"#;
let expected = r#"<pre><code class="language-rust"></code></pre>
"#;
assert_eq!(render_markdown(input, false), expected);
assert_eq!(render_markdown(input, true), expected);
let input = r#"
```rust
```
"#;
assert_eq!(render_markdown(input, false), expected);
assert_eq!(render_markdown(input, true), expected);
}
}
#[allow(deprecated)]
mod id_from_content {
use super::super::id_from_content;
#[test]
fn it_generates_anchors() {
assert_eq!(
id_from_content("## Method-call expressions"),
"method-call-expressions"
);
assert_eq!(id_from_content("## **Bold** title"), "bold-title");
assert_eq!(id_from_content("## `Code` title"), "code-title");
2021-11-10 05:41:44 +08:00
assert_eq!(
id_from_content("## title <span dir=rtl>foo</span>"),
"title-foo"
);
}
#[test]
fn it_generates_anchors_from_non_ascii_initial() {
2018-07-24 01:45:01 +08:00
assert_eq!(
id_from_content("## `--passes`: add more rustdoc passes"),
"--passes-add-more-rustdoc-passes"
2018-07-24 01:45:01 +08:00
);
assert_eq!(
id_from_content("## 中文標題 CJK title"),
"中文標題-cjk-title"
);
assert_eq!(id_from_content("## Über"), "Über");
}
}
mod html_munging {
use super::super::{normalize_id, unique_id_from_content};
#[test]
fn it_normalizes_ids() {
2018-07-24 01:45:01 +08:00
assert_eq!(
normalize_id("`--passes`: add more rustdoc passes"),
"--passes-add-more-rustdoc-passes"
2018-07-24 01:45:01 +08:00
);
assert_eq!(
normalize_id("Method-call 🐙 expressions \u{1f47c}"),
"method-call--expressions-"
);
assert_eq!(normalize_id("_-_12345"), "_-_12345");
assert_eq!(normalize_id("12345"), "12345");
assert_eq!(normalize_id("中文"), "中文");
assert_eq!(normalize_id("にほんご"), "にほんご");
assert_eq!(normalize_id("한국어"), "한국어");
assert_eq!(normalize_id(""), "");
}
#[test]
fn it_generates_unique_ids_from_content() {
// Same id if not given shared state
assert_eq!(
unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
"中文標題-cjk-title"
);
assert_eq!(
unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
"中文標題-cjk-title"
);
// Different id if given shared state
let mut id_counter = Default::default();
assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über");
assert_eq!(
unique_id_from_content("## 中文標題 CJK title", &mut id_counter),
"中文標題-cjk-title"
);
assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-1");
assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-2");
}
}
#[test]
fn escaped_brackets() {
assert_eq!(bracket_escape(""), "");
assert_eq!(bracket_escape("<"), "&lt;");
assert_eq!(bracket_escape(">"), "&gt;");
assert_eq!(bracket_escape("<>"), "&lt;&gt;");
assert_eq!(bracket_escape("<test>"), "&lt;test&gt;");
assert_eq!(bracket_escape("a<test>b"), "a&lt;test&gt;b");
}
}