Make print page (print.html) links link to anchors on the print page

Give every anchor id on the print page a path id prefix to help locate it, e.g. bar/foo.md#abc -> #bar-foo-abc. Also insert a dummy div at the start of each original page to make sure that links to the original page without an anchor can also be located.

Remove all the `./` components from the normalized path id, so that for "./foo/bar.html#abc" we still get "#foo-bar-abc".

Add support for redirect link anchors on the print page so that anchors can also be redirected, and also handle URL redirect links on the print page.

Add the path id prefix to the id of every element, and make all path ids lower case.

Fix print page footnote links by adding the path id prefix.

Signed-off-by: Hollow Man <hollowman@opensuse.org>
2022-04-15 14:43:20 +08:00 · 2022-04-15 14:43:20 +08:00 · 5b6b5e2605
parent 9877a68ab1
commit 5b6b5e2605
3 changed files with 357 additions and 61 deletions
--- a/src/renderer/html_handlebars/hbs_renderer.rs
+++ b/src/renderer/html_handlebars/hbs_renderer.rs
@ -56,10 +56,11 @@ impl HtmlHandlebars {

        let content = utils::render_markdown(&ch.content, ctx.html_config.smart_punctuation());

-        let fixed_content = utils::render_markdown_with_path(
+        let printed_item = utils::render_markdown_with_path_and_redirects(
            &ch.content,
            ctx.html_config.smart_punctuation(),
            Some(path),
+            &ctx.html_config.redirect,
        );
        if !ctx.is_index && ctx.html_config.print.page_break {
            // Add page break between chapters
@ -68,7 +69,25 @@ impl HtmlHandlebars {
            print_content
                .push_str(r#"<div style="break-before: page; page-break-before: always;"></div>"#);
        }
-        print_content.push_str(&fixed_content);
+        let print_page_id = {
+            let mut base = path.display().to_string();
+            if base.ends_with(".md") {
+                base.truncate(base.len() - 3);
+            }
+            &base
+                .replace("/", "-")
+                .replace("\\", "-")
+                .to_ascii_lowercase()
+        };
+
+        // We have to build header links in advance so that we can know the ranges
+        // for the headers in one page.
+        // Insert a dummy div to make sure that we can locate the specific page.
+        print_content.push_str(&(format!(r#"<div id="{print_page_id}"></div>"#)));
+        print_content.push_str(&build_header_links(
+            &build_print_element_id(&printed_item, &print_page_id),
+            Some(print_page_id),
+        ));

        // Update the context with data for this file
        let ctx_path = path
@ -214,7 +233,23 @@ impl HtmlHandlebars {
        code_config: &Code,
        edition: Option<RustEdition>,
    ) -> String {
-        let rendered = build_header_links(&rendered);
+        let rendered = build_header_links(&rendered, None);
+        let rendered = self.post_process_common(rendered, &playground_config, code_config, edition);
+
+        rendered
+    }
+
+    /// Applies some post-processing to the HTML to apply some adjustments.
+    ///
+    /// This common function is used for both normal chapters (via
+    /// `post_process`) and the combined print page.
+    fn post_process_common(
+        &self,
+        rendered: String,
+        playground_config: &Playground,
+        code_config: &Code,
+        edition: Option<RustEdition>,
+    ) -> String {
        let rendered = fix_code_blocks(&rendered);
        let rendered = add_playground_pre(&rendered, playground_config, edition);
        let rendered = hide_lines(&rendered, code_config);
@ -572,7 +607,7 @@ impl Renderer for HtmlHandlebars {
            debug!("Render template");
            let rendered = handlebars.render("index", &data)?;

-            let rendered = self.post_process(
+            let rendered = self.post_process_common(
                rendered,
                &html_config.playground,
                &html_config.code,
@ -783,9 +818,34 @@ fn make_data(
    Ok(data)
 }

+/// Go through the rendered print page HTML,
+/// add path id prefix to all the elements id as well as footnote links.
+fn build_print_element_id(html: &str, print_page_id: &str) -> String {
+    static ALL_ID: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(<[^>]*?id=")([^"]+?)""#).unwrap());
+    static FOOTNOTE_ID: Lazy<Regex> = Lazy::new(|| {
+        Regex::new(
+            r##"(<sup [^>]*?class="footnote-reference"[^>]*?>[^<]*?<a [^>]*?href="#)([^"]+?)""##,
+        )
+        .unwrap()
+    });
+
+    let temp_html = ALL_ID.replace_all(html, |caps: &Captures<'_>| {
+        format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2])
+    });
+
+    FOOTNOTE_ID
+        .replace_all(&temp_html, |caps: &Captures<'_>| {
+            format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2])
+        })
+        .into_owned()
+}
+
 /// Goes through the rendered HTML, making sure all header tags have
 /// an anchor respectively so people can link to sections directly.
-fn build_header_links(html: &str) -> String {
+///
+/// `print_page_id` should be set to the print page ID prefix when adjusting the
+/// print page.
+fn build_header_links(html: &str, print_page_id: Option<&str>) -> String {
    static BUILD_HEADER_LINKS: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"<h(\d)(?: id="([^"]+)")?(?: class="([^"]+)")?>(.*?)</h\d>"#).unwrap()
    });
@ -814,6 +874,7 @@ fn build_header_links(html: &str) -> String {
                caps.get(2).map(|x| x.as_str().to_string()),
                caps.get(3).map(|x| x.as_str().to_string()),
                &mut id_counter,
+                print_page_id,
            )
        })
        .into_owned()
@ -821,14 +882,26 @@ fn build_header_links(html: &str) -> String {

 /// Insert a sinle link into a header, making sure each link gets its own
 /// unique ID by appending an auto-incremented number (if necessary).
+///
+/// For `print.html`, we will add a path id prefix.
 fn insert_link_into_header(
    level: usize,
    content: &str,
    id: Option<String>,
    classes: Option<String>,
    id_counter: &mut HashMap<String, usize>,
+    print_page_id: Option<&str>,
 ) -> String {
-    let id = id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter));
+    let id = if let Some(print_page_id) = print_page_id {
+        let content_id = {
+            #[allow(deprecated)]
+            utils::id_from_content(content)
+        };
+        let with_prefix = format!("{} {}", print_page_id, content_id);
+        id.unwrap_or_else(|| utils::unique_id_from_content(&with_prefix, id_counter))
+    } else {
+        id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter))
+    };
    let classes = classes
        .map(|s| format!(" class=\"{s}\""))
        .unwrap_or_default();
@ -1117,7 +1190,7 @@ mod tests {
        ];

        for (src, should_be) in inputs {
-            let got = build_header_links(src);
+            let got = build_header_links(src, None);
            assert_eq!(got, should_be);
        }
    }
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@ -6,13 +6,13 @@ pub(crate) mod toml_ext;
 use crate::errors::Error;
 use log::error;
 use once_cell::sync::Lazy;
-use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd};
+use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag, TagEnd};
 use regex::Regex;

 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fmt::Write;
-use std::path::Path;
+use std::path::{Component, Path, PathBuf};

 pub use self::string::{
    take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines,
@ -83,63 +83,232 @@ pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap<String, us
    unique_id
 }

+/// Improve the path to try remove and solve .. token,
+/// This assumes that `a/b/../c` is `a/c`.
+///
+/// This function ensures a given path ending with '/' will also
+/// end with '/' after normalization.
+/// https://stackoverflow.com/a/68233480
+fn normalize_path<P: AsRef<Path>>(path: P) -> String {
+    let ends_with_slash = path.as_ref().to_str().map_or(false, |s| s.ends_with('/'));
+    let mut normalized = PathBuf::new();
+    for component in path.as_ref().components() {
+        match &component {
+            Component::ParentDir => {
+                if !normalized.pop() {
+                    normalized.push(component);
+                }
+            }
+            Component::CurDir => {}
+            _ => {
+                normalized.push(component);
+            }
+        }
+    }
+    if ends_with_slash {
+        normalized.push("");
+    }
+    normalized.to_str().unwrap().replace("\\", "/").to_string()
+}
+
+/// Converts a relative URL path to a reference ID for the print page.
+fn normalize_print_page_id(mut path: String) -> String {
+    path = path
+        .replace("/", "-")
+        .replace(".html#", "-")
+        .replace("#", "-")
+        .to_ascii_lowercase();
+    if path.ends_with(".html") {
+        path.truncate(path.len() - 5);
+    }
+    path
+}
+
 /// Fix links to the correct location.
 ///
 /// This adjusts links, such as turning `.md` extensions to `.html`.
 ///
-/// `path` is the path to the page being rendered relative to the root of the
-/// book. This is used for the `print.html` page so that links on the print
-/// page go to the original location. Normal page rendering sets `path` to
-/// None. Ideally, print page links would link to anchors on the print page,
-/// but that is very difficult.
-fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
+/// See [`render_markdown_with_path_and_redirects`] for a description of
+/// `path` and `redirects`.
+fn adjust_links<'a>(
+    event: Event<'a>,
+    path: Option<&Path>,
+    redirects: &HashMap<String, String>,
+) -> Event<'a> {
    static SCHEME_LINK: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap());
-    static MD_LINK: Lazy<Regex> =
-        Lazy::new(|| Regex::new(r"(?P<link>.*)\.md(?P<anchor>#.*)?").unwrap());
+    static HTML_MD_LINK: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r"(?P<link>.*)\.(html|md)(?P<anchor>#.*)?").unwrap());

-    fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
-        if dest.starts_with('#') {
-            // Fragment-only link.
-            if let Some(path) = path {
-                let mut base = path.display().to_string();
-                if base.ends_with(".md") {
-                    base.replace_range(base.len() - 3.., ".html");
-                }
-                return format!("{}{}", base, dest).into();
-            } else {
-                return dest;
+    fn add_base(path: Option<&Path>) -> String {
+        let mut fixed_link = String::new();
+        if let Some(path) = path {
+            let base = path
+                .parent()
+                .expect("path can't be empty")
+                .to_str()
+                .expect("utf-8 paths only");
+            if !base.is_empty() {
+                write!(fixed_link, "{}/", base).unwrap();
            }
        }
-        // Don't modify links with schemes like `https`.
-        if !SCHEME_LINK.is_match(&dest) {
-            // This is a relative link, adjust it as necessary.
-            let mut fixed_link = String::new();
-            if let Some(path) = path {
-                let base = path
+        fixed_link.to_string()
+    }
+
+    fn fix_print_page_link<'a>(
+        mut normalized_path: String,
+        redirects: &HashMap<String, String>,
+    ) -> CowStr<'a> {
+        // Fix redirect links
+        let (path_no_fragment, fragment) = match normalized_path.split_once('#') {
+            Some((a, b)) => (a, Some(b)),
+            None => (normalized_path.as_str(), None),
+        };
+        for (original, redirect) in redirects {
+            if !normalize_path(original.trim_start_matches('/'))
+                .eq_ignore_ascii_case(&normalized_path)
+                && !normalize_path(original.trim_start_matches('/'))
+                    .eq_ignore_ascii_case(&path_no_fragment)
+            {
+                continue;
+            }
+
+            let mut unnormalized_path = String::new();
+            if SCHEME_LINK.is_match(&redirect) {
+                unnormalized_path = redirect.to_string();
+            } else {
+                let base = PathBuf::from(path_no_fragment)
                    .parent()
                    .expect("path can't be empty")
                    .to_str()
-                    .expect("utf-8 paths only");
-                if !base.is_empty() {
-                    write!(fixed_link, "{}/", base).unwrap();
+                    .expect("utf-8 paths only")
+                    .to_owned();
+
+                let normalized_base = normalize_path(base).trim_matches('/').to_owned();
+                if !normalized_base.is_empty() {
+                    write!(unnormalized_path, "{}/{}", normalized_base, redirect).unwrap();
+                } else {
+                    unnormalized_path = redirect.to_string().trim_start_matches('/').to_string();
                }
            }

-            if let Some(caps) = MD_LINK.captures(&dest) {
-                fixed_link.push_str(&caps["link"]);
-                fixed_link.push_str(".html");
-                if let Some(anchor) = caps.name("anchor") {
-                    fixed_link.push_str(anchor.as_str());
+            // original without anchors, need to append link anchors
+            if !original.contains("#") {
+                if let Some(fragment) = fragment {
+                    if !unnormalized_path.contains("#") {
+                        unnormalized_path.push('#');
+                    } else {
+                        unnormalized_path.push('-');
+                    }
+                    unnormalized_path.push_str(fragment);
                }
+            }
+
+            if SCHEME_LINK.is_match(&redirect) {
+                return CowStr::from(unnormalized_path);
            } else {
-                fixed_link.push_str(&dest);
-            };
-            return CowStr::from(fixed_link);
+                normalized_path = normalize_path(unnormalized_path);
+            }
+            break;
        }
-        dest
+
+        // Check again to make sure anchors are the html links inside the book.
+        if normalized_path.starts_with("../") || normalized_path.contains("/../") {
+            return CowStr::from(normalized_path);
+        }
+
+        let mut fixed_anchor_for_print = String::new();
+        fixed_anchor_for_print.push_str("#");
+        fixed_anchor_for_print.push_str(&normalize_print_page_id(normalized_path));
+        CowStr::from(fixed_anchor_for_print)
    }

-    fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
+    /// Fix resource links like img to the correct location.
+    fn fix_resource_links<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
+        // Don't modify links with schemes like `https`.
+        if SCHEME_LINK.is_match(&dest) {
+            return dest;
+        }
+
+        // This is a relative link, adjust it as necessary.
+        let mut fixed_link = add_base(path);
+        fixed_link.push_str(&dest);
+        CowStr::from(fixed_link)
+    }
+
+    fn fix_a_links_with_type<'a>(
+        dest: CowStr<'a>,
+        path: Option<&Path>,
+        redirects: &HashMap<String, String>,
+        link_type: LinkType,
+    ) -> CowStr<'a> {
+        if link_type == LinkType::Email {
+            return dest;
+        }
+        fix_a_links(dest, path, redirects)
+    }
+
+    /// Adjust markdown file to correct point in the html file.
+    fn fix_a_links<'a>(
+        dest: CowStr<'a>,
+        path: Option<&Path>,
+        redirects: &HashMap<String, String>,
+    ) -> CowStr<'a> {
+        if dest.starts_with('#') {
+            // Fragment-only link.
+            return match path {
+                Some(path) => {
+                    let mut base = path.display().to_string();
+                    if base.ends_with(".md") {
+                        base.truncate(base.len() - 3);
+                    }
+                    format!(
+                        "#{}{}",
+                        normalize_print_page_id(normalize_path(base)),
+                        dest.replace("#", "-")
+                    )
+                    .into()
+                }
+                None => dest,
+            };
+        }
+
+        // Don't modify links with schemes like `https`.
+        if SCHEME_LINK.is_match(&dest) {
+            return dest;
+        }
+
+        // This is a relative link, adjust it as necessary.
+        let mut fixed_link = add_base(path);
+
+        if let Some(caps) = HTML_MD_LINK.captures(&dest) {
+            fixed_link.push_str(&caps["link"]);
+            fixed_link.push_str(".html");
+            if let Some(anchor) = caps.name("anchor") {
+                fixed_link.push_str(anchor.as_str());
+            }
+        } else {
+            fixed_link.push_str(&dest);
+        };
+
+        let normalized_path = normalize_path(&fixed_link);
+
+        // Judge if the html link is inside the book.
+        if !normalized_path.starts_with("../") && !normalized_path.contains("/../") {
+            // In `print.html`, print page links would all link to anchors on the print page.
+            return match path {
+                Some(_) => fix_print_page_link(normalized_path, redirects),
+                None => CowStr::from(fixed_link),
+            };
+        }
+        // In normal page rendering, links to anchors on another page.
+        CowStr::from(fixed_link)
+    }
+
+    fn fix_html<'a>(
+        html: CowStr<'a>,
+        path: Option<&Path>,
+        redirects: &HashMap<String, String>,
+    ) -> CowStr<'a> {
        // This is a terrible hack, but should be reasonably reliable. Nobody
        // should ever parse a tag with a regex. However, there isn't anything
        // in Rust that I know of that is suitable for handling partial html
@ -148,12 +317,45 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
        // There are dozens of HTML tags/attributes that contain paths, so
        // feel free to add more tags if desired; these are the only ones I
        // care about right now.
-        static HTML_LINK: Lazy<Regex> =
-            Lazy::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap());
+        static A_LINK: Lazy<Regex> =
+            Lazy::new(|| Regex::new(r#"(<a [^>]*?href=")([^"]+?)""#).unwrap());
+        static A_NAME: Lazy<Regex> =
+            Lazy::new(|| Regex::new(r#"(<a [^>]*?name=")([^"]+?)""#).unwrap());
+        static IMG_LINK: Lazy<Regex> =
+            Lazy::new(|| Regex::new(r#"(<img [^>]*?src=")([^"]+?)""#).unwrap());

-        HTML_LINK
-            .replace_all(&html, |caps: &regex::Captures<'_>| {
-                let fixed = fix(caps[2].into(), path);
+        let img_link_fixed_html = IMG_LINK.replace_all(&html, |caps: &regex::Captures<'_>| {
+            let fixed = fix_resource_links(caps[2].into(), path);
+            format!("{}{}\"", &caps[1], fixed)
+        });
+
+        let a_name_fixed_html =
+            A_NAME.replace_all(&img_link_fixed_html, |caps: &regex::Captures<'_>| {
+                // This is a relative link, adjust it as necessary.
+                let origin_name = &caps[2].to_string();
+                format!(
+                    "{}{}\"",
+                    &caps[1],
+                    CowStr::from(match path {
+                        Some(path) => {
+                            let mut base = path.display().to_string();
+                            if base.ends_with(".md") {
+                                base.truncate(base.len() - 3);
+                            }
+                            format!(
+                                "{}-{}",
+                                normalize_print_page_id(normalize_path(base)),
+                                origin_name.to_string()
+                            )
+                        }
+                        None => origin_name.to_string(),
+                    })
+                )
+            });
+
+        A_LINK
+            .replace_all(&a_name_fixed_html, |caps: &regex::Captures<'_>| {
+                let fixed = fix_a_links(caps[2].into(), path, &redirects);
                format!("{}{}\"", &caps[1], fixed)
            })
            .into_owned()
@ -168,7 +370,7 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
            id,
        }) => Event::Start(Tag::Link {
            link_type,
-            dest_url: fix(dest_url, path),
+            dest_url: fix_a_links_with_type(dest_url, path, redirects, link_type),
            title,
            id,
        }),
@ -179,12 +381,12 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
            id,
        }) => Event::Start(Tag::Image {
            link_type,
-            dest_url: fix(dest_url, path),
+            dest_url: fix_resource_links(dest_url, path),
            title,
            id,
        }),
-        Event::Html(html) => Event::Html(fix_html(html, path)),
-        Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)),
+        Event::Html(html) => Event::Html(fix_html(html, path, redirects)),
+        Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path, redirects)),
        _ => event,
    }
 }
@ -194,6 +396,15 @@ pub fn render_markdown(text: &str, smart_punctuation: bool) -> String {
    render_markdown_with_path(text, smart_punctuation, None)
 }

+/// Wrapper around for API compatibility.
+pub fn render_markdown_with_path(
+    text: &str,
+    smart_punctuation: bool,
+    path: Option<&Path>,
+) -> String {
+    render_markdown_with_path_and_redirects(text, smart_punctuation, path, &HashMap::new())
+}
+
 pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> {
    let mut opts = Options::empty();
    opts.insert(Options::ENABLE_TABLES);
@ -207,16 +418,26 @@ pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> {
    Parser::new_ext(text, opts)
 }

-pub fn render_markdown_with_path(
+/// Renders markdown to HTML.
+///
+/// `path` is the path to the page being rendered relative to the root of the
+/// book. This is used for the `print.html` page so that links on the print
+/// page go to the anchors that has a path id prefix. Normal page rendering
+/// sets `path` to None.
+///
+/// `redirects` is also only for the print page. It's for adjusting links to
+/// a redirected location to go to the correct spot on the `print.html` page.
+pub(crate) fn render_markdown_with_path_and_redirects(
    text: &str,
    smart_punctuation: bool,
    path: Option<&Path>,
+    redirects: &HashMap<String, String>,
 ) -> String {
    let mut s = String::with_capacity(text.len() * 3 / 2);
    let p = new_cmark_parser(text, smart_punctuation);
    let events = p
        .map(clean_codeblock_headers)
-        .map(|event| adjust_links(event, path))
+        .map(|event| adjust_links(event, path, &redirects))
        .flat_map(|event| {
            let (a, b) = wrap_tables(event);
            a.into_iter().chain(b)
--- a/tests/rendered_output.rs
+++ b/tests/rendered_output.rs
@ -126,12 +126,14 @@ fn check_correct_relative_links_in_print_page() {
    assert_contains_strings(
        first.join("print.html"),
        &[
-            r##"<a href="second/../first/nested.html">the first section</a>,"##,
+            r##"<a href="#first-nested">the first section</a>,"##,
            r##"<a href="second/../../std/foo/bar.html">outside</a>"##,
            r##"<img src="second/../images/picture.png" alt="Some image" />"##,
-            r##"<a href="second/nested.html#some-section">fragment link</a>"##,
-            r##"<a href="second/../first/markdown.html">HTML Link</a>"##,
+            r##"<a href="#second-nested-some-section">fragment link</a>"##,
+            r##"<a href="#first-markdown">HTML Link</a>"##,
            r##"<img src="second/../images/picture.png" alt="raw html">"##,
+            r##"<sup class="footnote-reference"><a href="#first-markdown-1">1</a></sup>"##,
+            r##"<sup class="footnote-reference"><a href="#first-markdown-word">2</a></sup>"##,
        ],
    );
 }