diff --git a/Cargo.lock b/Cargo.lock
index ffec902d..1b51dd59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -861,6 +861,7 @@ dependencies = [
  "tokio",
  "toml",
  "topological-sort",
+ "url",
  "walkdir",
  "warp",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 61b23e1d..cf884517 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ shlex = "1"
 tempfile = "3.0"
 toml = "0.5.1"
 topological-sort = "0.1.0"
+url = "2.2.2"
 
 # Watch feature
 notify = { version = "4.0", optional = true }
diff --git a/guide/src/format/configuration/renderers.md b/guide/src/format/configuration/renderers.md
index 1aa1eef9..c3c78441 100644
--- a/guide/src/format/configuration/renderers.md
+++ b/guide/src/format/configuration/renderers.md
@@ -109,6 +109,7 @@ edit-url-template = "https://github.com/rust-lang/mdBook/edit/master/guide/{path
 site-url = "/example-book/"
 cname = "myproject.rs"
 input-404 = "not-found.md"
+sitemap = true
 ```
 
 The following configuration options are available:
@@ -162,6 +163,8 @@ The following configuration options are available:
   This string will be written to a file named CNAME in the root of your site, as
   required by GitHub Pages (see [*Managing a custom domain for your GitHub Pages
   site*][custom domain]).
+- **sitemap:** Set to `true` to generate a `sitemap.xml` in the output directory. This
+  requires `site-url` to be set to an absolute URL; a path-only value is rejected.
 
 [custom domain]: https://docs.github.com/en/github/working-with-github-pages/managing-a-custom-domain-for-your-github-pages-site
 
diff --git a/src/config.rs b/src/config.rs
index daeccbd0..4e2d483f 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -544,6 +544,9 @@ pub struct HtmlConfig {
     /// The mapping from old pages to new pages/URLs to use when generating
     /// redirects.
     pub redirect: HashMap<String, String>,
+    /// Whether to enable sitemap generation. If this is set, `site_url` must be an
+    /// absolute URL.
+    pub sitemap: bool,
 }
 
 impl Default for HtmlConfig {
@@ -571,6 +574,7 @@ impl Default for HtmlConfig {
             cname: None,
             livereload_url: None,
             redirect: HashMap::new(),
+            sitemap: false,
         }
     }
 }
diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs
index 69dc3124..7885e4d3 100644
--- a/src/renderer/html_handlebars/hbs_renderer.rs
+++ b/src/renderer/html_handlebars/hbs_renderer.rs
@@ -5,11 +5,13 @@ use crate::renderer::html_handlebars::helpers;
 use crate::renderer::{RenderContext, Renderer};
 use crate::theme::{self, playground_editor, Theme};
 use crate::utils;
+use url::Url;
 
 use std::borrow::Cow;
 use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::fs::{self, File};
+use std::io::{BufWriter, Write};
 use std::path::{Path, PathBuf};
 
 use crate::utils::fs::get_404_output_file;
@@ -431,6 +433,125 @@ impl HtmlHandlebars {
 
         Ok(())
     }
+
+    /// Writes a sitemap to `destination` listing every non-draft chapter in `items`, with
+    /// each chapter's location resolved against `site_url`.
+    ///
+    /// Fails if `destination` already exists, if `site_url` does not parse as an absolute
+    /// URL, or if the file cannot be created or written.
+    fn generate_sitemap<'a>(
+        &self,
+        destination: &Path,
+        site_url: &str,
+        items: impl Iterator<Item = &'a BookItem>,
+    ) -> Result<()> {
+        if destination.exists() {
+            // sanity check to avoid accidentally overwriting a real file.
+            let msg = format!(
+                "Not generating \"{}\" because it already exists. Are you sure you want to generate a sitemap?",
+                destination.display(),
+            );
+            return Err(Error::msg(msg));
+        }
+
+        let mut site_url = Url::parse(site_url).with_context(|| {
+            format!(
+                "output.html.site-url (\"{}\") is not a valid absolute URL",
+                site_url
+            )
+        })?;
+        // The URL must end with a slash if it doesn't already, otherwise it isn't considered a
+        // directory for the purpose of joining!
+        if !site_url.path().ends_with('/') {
+            site_url.set_path(&format!("{}/", site_url.path()));
+        }
+
+        let sitemap = BufWriter::new(
+            File::create(destination).context("Failed to create sitemap file")?,
+        );
+        self.write_sitemap(sitemap, &site_url, items)
+            .context("Error writing to sitemap file")
+    }
+
+    /// Streams the sitemap XML into `sitemap`: one `<url>` entry per non-draft chapter, whose
+    /// `<loc>` is the chapter path (with an `.html` extension) joined onto `site_url`.
+    fn write_sitemap<'a>(
+        &self,
+        mut sitemap: impl Write,
+        site_url: &Url,
+        items: impl Iterator<Item = &'a BookItem>,
+    ) -> Result<()> {
+        writeln!(sitemap, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
+        writeln!(sitemap, "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")?;
+
+        for (i, path) in items
+            .filter_map(|item| match item {
+                BookItem::Chapter(ch) if !ch.is_draft_chapter() => ch.path.as_ref(),
+                _ => None,
+            })
+            .enumerate()
+        {
+            // No joke, this is in the spec
+            if i == 50_000 {
+                warn!("Sitemaps must not provide more than 50,000 URLs; consider using a sitemap index instead");
+            }
+
+            let path = path.with_extension("html");
+            let path = path
+                .to_str()
+                .expect("Path should be valid UTF-8 from prior processing");
+            let url = site_url
+                .join(path)
+                .with_context(|| format!("Failed to join {} with site_url", path))?;
+
+            writeln!(sitemap, "\t<url>")?;
+            writeln!(sitemap, "\t\t<loc>{}</loc>", xml_escapes(url.as_str()))?;
+            // TODO: lastmod from src file modification time?
+            writeln!(sitemap, "\t</url>")?;
+        }
+
+        writeln!(sitemap, "</urlset>")?;
+
+        // TODO: check that the file isn't more than 50 MiB (that's what the spec says)
+
+        sitemap.flush()?;
+        Ok(())
+    }
+}
+
+/// Replaces `&`, `'`, `"`, `>` and `<` in `s` with their predefined XML entities, borrowing
+/// the input unchanged when it contains none of them.
+fn xml_escapes(s: &str) -> Cow<'_, str> {
+    let mut to_escape = s
+        .match_indices(|ref c| ['&', '\'', '"', '>', '<'].contains(c))
+        .peekable();
+    if to_escape.peek().is_none() {
+        return Cow::Borrowed(s);
+    }
+
+    // This is under-shooting it, but better than a wild guess
+    let mut escaped = String::with_capacity(s.len());
+    let mut n = 0;
+    for (i, c) in to_escape {
+        // Push everything before this match...
+        escaped.push_str(&s[n..i]);
+        // ...and start next "as-is push" from the character after this one.
+        // (`+ 1` OK because they are all ASCII chars)
+        n = i + 1;
+
+        escaped.push_str(match c {
+            "&" => "&amp;",
+            "'" => "&apos;",
+            "\"" => "&quot;",
+            ">" => "&gt;",
+            "<" => "&lt;",
+            _ => unreachable!(),
+        });
+    }
+    // Push the rest, too
+    escaped.push_str(&s[n..]);
+
+    Cow::Owned(escaped)
 }
 
 // TODO(mattico): Remove some time after the 0.1.8 release
@@ -571,6 +692,16 @@ impl Renderer for HtmlHandlebars {
         self.emit_redirects(&ctx.destination, &handlebars, &html_config.redirect)
             .context("Unable to emit redirects")?;
 
+        if html_config.sitemap {
+            let site_url = html_config
+                .site_url
+                .as_ref()
+                .ok_or_else(|| Error::msg("site-url must be set to generate a sitemap"))?;
+            // NOTE(review): the copy of remaining source files below runs after this, so a
+            // `sitemap.xml` in the source dir would presumably overwrite this one -- confirm.
+            self.generate_sitemap(&destination.join("sitemap.xml"), site_url, book.iter())?;
+        }
+
         // Copy all remaining files, avoid a recursive copy from/to the book build dir
         utils::fs::copy_files_except_ext(&src_dir, destination, true, Some(&build_dir), &["md"])?;
 