Add sitemap generation support to HTML renderer

Fixes #1491
This commit is contained in:
ISSOtm 2021-07-30 18:29:48 +02:00
parent 2213312938
commit 333873a1ac
5 changed files with 132 additions and 0 deletions

1
Cargo.lock generated
View File

@ -861,6 +861,7 @@ dependencies = [
"tokio", "tokio",
"toml", "toml",
"topological-sort", "topological-sort",
"url",
"walkdir", "walkdir",
"warp", "warp",
] ]

View File

@ -34,6 +34,7 @@ shlex = "1"
tempfile = "3.0" tempfile = "3.0"
toml = "0.5.1" toml = "0.5.1"
topological-sort = "0.1.0" topological-sort = "0.1.0"
url = "2.2.2"
# Watch feature # Watch feature
notify = { version = "4.0", optional = true } notify = { version = "4.0", optional = true }

View File

@ -109,6 +109,7 @@ edit-url-template = "https://github.com/rust-lang/mdBook/edit/master/guide/{path
site-url = "/example-book/" site-url = "/example-book/"
cname = "myproject.rs" cname = "myproject.rs"
input-404 = "not-found.md" input-404 = "not-found.md"
sitemap = true
``` ```
The following configuration options are available: The following configuration options are available:
@ -162,6 +163,8 @@ The following configuration options are available:
This string will be written to a file named CNAME in the root of your site, as This string will be written to a file named CNAME in the root of your site, as
required by GitHub Pages (see [*Managing a custom domain for your GitHub Pages required by GitHub Pages (see [*Managing a custom domain for your GitHub Pages
site*][custom domain]). site*][custom domain]).
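- **sitemap:** Set to `true` to generate a `sitemap.xml` file in the root of your site.
This requires `site-url` to be set to an absolute URL (for example `https://example.com/example-book/`); a path-only value such as `/example-book/` is rejected with an error.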
[custom domain]: https://docs.github.com/en/github/working-with-github-pages/managing-a-custom-domain-for-your-github-pages-site [custom domain]: https://docs.github.com/en/github/working-with-github-pages/managing-a-custom-domain-for-your-github-pages-site

View File

@ -544,6 +544,9 @@ pub struct HtmlConfig {
/// The mapping from old pages to new pages/URLs to use when generating /// The mapping from old pages to new pages/URLs to use when generating
/// redirects. /// redirects.
pub redirect: HashMap<String, String>, pub redirect: HashMap<String, String>,
/// Whether to enable sitemap generation. If this is set, `site_url` must be an
/// absolute URL.
pub sitemap: bool,
} }
impl Default for HtmlConfig { impl Default for HtmlConfig {
@ -571,6 +574,7 @@ impl Default for HtmlConfig {
cname: None, cname: None,
livereload_url: None, livereload_url: None,
redirect: HashMap::new(), redirect: HashMap::new(),
sitemap: false,
} }
} }
} }

View File

@ -5,11 +5,13 @@ use crate::renderer::html_handlebars::helpers;
use crate::renderer::{RenderContext, Renderer}; use crate::renderer::{RenderContext, Renderer};
use crate::theme::{self, playground_editor, Theme}; use crate::theme::{self, playground_editor, Theme};
use crate::utils; use crate::utils;
use url::Url;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::{self, File}; use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use crate::utils::fs::get_404_output_file; use crate::utils::fs::get_404_output_file;
@ -431,6 +433,119 @@ impl HtmlHandlebars {
Ok(()) Ok(())
} }
fn generate_sitemap<'a>(
&self,
destination: &Path,
site_url: &str,
items: impl Iterator<Item = &'a BookItem>,
) -> Result<()> {
if destination.exists() {
// sanity check to avoid accidentally overwriting a real file.
let msg = format!(
"Not generating \"{}\" because it already exists. Are you sure you want to generate a sitemap?",
destination.display(),
);
return Err(Error::msg(msg));
}
let mut site_url = Url::parse(site_url).with_context(|| {
format!(
"output.html.site-url (\"{}\") is not a valid absolute URL",
site_url
)
})?;
// The URL must end with a slash if it doesn't already, otherwise it isn't considered a
// directory for the purpose of joining!
if !site_url.path().ends_with('/') {
site_url.set_path(&format!("{}/", site_url.path()));
}
let sitemap = BufWriter::new(
File::create(&destination).with_context(|| "Failed to create sitemap file")?,
);
self.write_sitemap(sitemap, &site_url, items)
.with_context(|| "Error writing to sitemap file")
}
/// Writes a complete sitemap XML document to `sitemap`.
///
/// One `<url>` entry is emitted for every non-draft chapter in `items`. The
/// chapter's path gets its extension replaced by `html` and is then resolved
/// against `site_url`, which is expected to end with a slash so that joining
/// treats it as a directory.
///
/// # Errors
///
/// Fails if a chapter path cannot be joined onto `site_url`, or if writing to
/// or flushing `sitemap` fails.
///
/// # Panics
///
/// Panics if a chapter path is not valid UTF-8.
fn write_sitemap<'a>(
    &self,
    mut sitemap: impl Write,
    site_url: &Url,
    items: impl Iterator<Item = &'a BookItem>,
) -> Result<()> {
    writeln!(sitemap, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
    writeln!(
        sitemap,
        "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">",
    )?;

    // Only non-draft chapters produce pages. Returning the optional path
    // directly avoids an `unwrap()`: a chapter without a path is skipped
    // instead of aborting the build.
    let chapter_paths = items.filter_map(|item| match item {
        BookItem::Chapter(ch) if !ch.is_draft_chapter() => ch.path.as_ref(),
        _ => None,
    });

    for (i, path) in chapter_paths.enumerate() {
        // The sitemap protocol caps a single file at 50,000 URLs; warn once,
        // when the 50,001st entry is about to be written.
        if i == 50_000 {
            warn!("Sitemaps must not provide more than 50,000 URLs; consider using a sitemap index instead");
        }

        let path = path.with_extension("html");
        let path = path
            .to_str()
            .expect("Path should be valid UTF-8 from prior processing");
        let url = site_url
            .join(path)
            .with_context(|| format!("Failed to join {} with site_url", path))?;

        writeln!(sitemap, "\t<url>")?;
        // `as_str` borrows the serialized URL, so no temporary `String` is
        // allocated per entry.
        writeln!(sitemap, "\t\t<loc>{}</loc>", xml_escapes(url.as_str()))?;
        // TODO: lastmod from src file modification time?
        writeln!(sitemap, "\t</url>")?;
    }

    writeln!(sitemap, "</urlset>")?;
    // TODO: check that the file isn't more than 50 MiB (that's what the spec says)
    // Flush explicitly so buffered-write errors are reported instead of being
    // silently dropped when the writer goes out of scope.
    sitemap.flush()?;
    Ok(())
}
}
/// Replaces the five characters that are special in XML (`&`, `'`, `"`, `>`
/// and `<`) with their predefined entity references.
///
/// Returns the input borrowed and unchanged when it contains none of those
/// characters, so the common case does not allocate.
fn xml_escapes(s: &str) -> Cow<'_, str> {
    const SPECIAL: &[char] = &['&', '\'', '"', '>', '<'];

    // Fast path: nothing to escape, hand the input back as-is.
    let first = match s.find(SPECIAL) {
        Some(pos) => pos,
        None => return Cow::Borrowed(s),
    };

    // The result is at least as long as the input; this under-shoots, but is
    // better than a wild guess.
    let mut escaped = String::with_capacity(s.len());
    // Everything before the first special character is copied verbatim.
    // (`first` is a char boundary because it was returned by `find`.)
    escaped.push_str(&s[..first]);
    for ch in s[first..].chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '\'' => escaped.push_str("&apos;"),
            '"' => escaped.push_str("&quot;"),
            '>' => escaped.push_str("&gt;"),
            '<' => escaped.push_str("&lt;"),
            other => escaped.push(other),
        }
    }
    Cow::Owned(escaped)
}
// TODO(mattico): Remove some time after the 0.1.8 release // TODO(mattico): Remove some time after the 0.1.8 release
@ -571,6 +686,14 @@ impl Renderer for HtmlHandlebars {
self.emit_redirects(&ctx.destination, &handlebars, &html_config.redirect) self.emit_redirects(&ctx.destination, &handlebars, &html_config.redirect)
.context("Unable to emit redirects")?; .context("Unable to emit redirects")?;
// Sitemap generation is opt-in. It needs `site-url`, which is parsed as an
// absolute URL and used as the base for every page URL.
if html_config.sitemap {
    let site_url = html_config
        .site_url
        .as_ref()
        .ok_or_else(|| Error::msg("site-url must be set to generate a sitemap"))?;
    self.generate_sitemap(&destination.join("sitemap.xml"), site_url, book.iter())?;
}
// Copy all remaining files, avoid a recursive copy from/to the book build dir // Copy all remaining files, avoid a recursive copy from/to the book build dir
utils::fs::copy_files_except_ext(&src_dir, destination, true, Some(&build_dir), &["md"])?; utils::fs::copy_files_except_ext(&src_dir, destination, true, Some(&build_dir), &["md"])?;