Tried to make summary parsing easier to understand and implement by writing out the EBNF grammar

This commit is contained in:
Michael Bryan 2017-06-25 15:53:01 +08:00
parent eb839e4298
commit 588b444f06
2 changed files with 151 additions and 6 deletions

View File

@ -9,7 +9,7 @@ use std::io::Read;
mod summary; mod summary;
pub use self::summary::Summary; pub use self::summary::{Summary, parse_summary};
/// The object in charge of parsing the source directory into a usable /// The object in charge of parsing the source directory into a usable

View File

@ -1,18 +1,141 @@
use std::error::Error; use std::error::Error;
use std::fmt::{self, Formatter, Display}; use std::fmt::{self, Formatter, Display};
use std::ops::{Deref, DerefMut}; use std::ops::{Deref, DerefMut};
use pulldown_cmark; use pulldown_cmark::{self, Event, Tag};
/// Turn the contents of a `SUMMARY.md` file into a `Summary`, a sort of
/// "recipe" which is followed when loading a book from disk.
///
/// # Summary Format
///
/// **Title:** By convention a summary opens with a title, generally
/// "# Summary". The title is optional; when it is missing the resulting
/// `Summary` simply has no title.
///
/// **Prefix Chapter:** Ahead of the main numbered chapters you may list a few
/// elements which will not be numbered. This is handy for forewords,
/// introductions, etc. There are some constraints, however: prefix chapters
/// cannot be nested (they must all sit at the root level), and no prefix
/// chapter may be added once the numbered chapters have started.
///
/// ```markdown
/// [Title of prefix element](relative/path/to/markdown.md)
/// ```
///
/// **Numbered Chapter:** Numbered chapters make up the main content of the
/// book. They are numbered and may be nested, giving a nice hierarchy
/// (chapters, sub-chapters, etc.)
///
/// ```markdown
/// - [Title of the Chapter](relative/path/to/markdown.md)
/// ```
///
/// Either `-` or `*` may be used to mark a numbered chapter.
///
/// **Suffix Chapter:** Following the numbered chapters you may list a few
/// more non-numbered chapters. They behave exactly like prefix chapters, but
/// come after the numbered chapters instead of before them.
///
/// Any other element is unsupported and will, at best, be ignored or
/// otherwise result in an error.
///
/// > **Note:** at the moment only the optional title is extracted from the
/// > text; the chapter sections described above are not parsed yet.
pub fn parse_summary(summary: &str) -> Result<Summary, Box<Error>> {
    SummaryParser::new(summary).parse()
}
/// The parsed `SUMMARY.md`, specifying how the book should be laid out. /// The parsed `SUMMARY.md`, specifying how the book should be laid out.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Summary { pub struct Summary {
title: Option<String>, title: Option<String>,
} }
/// Parse the text from a `SUMMARY.md` file into a sort of "recipe" to be /// A stateful parser for parsing a `SUMMARY.md` file.
/// used when loading a book from disk. ///
pub fn parse_summary(summary: &str) -> Result<Summary, Box<Error>> { /// # Grammar
unimplemented!() ///
/// The `SUMMARY.md` file has a grammar which looks something like this:
///
/// ```text
/// summary ::= title prefix_chapters numbered_chapters suffix_chapters
/// title ::= "# " TEXT
/// | EPSILON
/// prefix_chapters ::= item*
/// suffix_chapters ::= item*
/// numbered_chapters ::= dotted_item+
/// dotted_item ::= INDENT* DOT_POINT item
/// item ::= link
/// | separator
/// separator ::= "---"
/// link ::= "[" TEXT "]" "(" TEXT ")"
/// DOT_POINT ::= "-"
/// | "*"
/// ```
///
/// > **Note:** the `TEXT` terminal is "normal" text, and should (roughly)
/// > match the following regex: `[^<>\n\[\]]+`.
struct SummaryParser<'a> {
/// The stream of markdown events which `pulldown_cmark` produces for the
/// text being parsed.
stream: pulldown_cmark::Parser<'a>,
/// The `Summary` which is filled in while parsing and handed back to the
/// caller once parsing has finished.
summary: Summary,
}
impl<'a> SummaryParser<'a>
{
/// Create a parser over `text`, starting out with an empty (default)
/// `Summary`.
fn new(text: &str) -> SummaryParser {
    SummaryParser {
        stream: pulldown_cmark::Parser::new(text),
        summary: Summary::default(),
    }
}
/// Consume the parser, running it over the text and returning the finished
/// `Summary`.
///
/// Only the (optional) title is parsed for now.
fn parse(mut self) -> Result<Summary, Box<Error>> {
    let title = self.parse_title();
    self.summary.title = title;

    Ok(self.summary)
}
/// Try to read a top-level heading (`<h1>`) from the front of the event
/// stream and return its text.
///
/// Returns `None` when the next event does not open a `<h1>`. Otherwise the
/// text fragments found inside the heading are concatenated, with any inline
/// styling (e.g. bold or emphasis markers) skipped.
///
/// NOTE(review): the first event is pulled off the stream even when it is
/// not a `<h1>`, so it is lost to whatever parsing step runs next.
fn parse_title(&mut self) -> Option<String> {
    match self.stream.next() {
        Some(Event::Start(Tag::Header(1))) => {}
        _ => return None,
    }

    debug!("[*] Found a h1 in the SUMMARY");

    // TODO: How do we deal with headings like "# My **awesome** summary"?
    // For now only the `Event::Text` pieces are glued together and every
    // styling event is dropped.
    let mut title = String::new();

    loop {
        match self.stream.next() {
            Some(Event::End(Tag::Header(1))) => break,
            Some(Event::Text(text)) => title.push_str(&text),
            Some(_) => {}
            None => {
                // If we ever get here then chances are pulldown_cmark is
                // seriously broken: there was an opening <h1> tag but no
                // closing one. The whole stream of events has been
                // consumed as well, so any parsing after this will most
                // likely just hit EOF and end early :(
                warn!("[*] No closing <h1> tag in the SUMMARY.md file");
                break;
            }
        }
    }

    Some(title)
}
} }
/// A section number like "1.2.3", basically just a newtype'd `Vec<u32>`. /// A section number like "1.2.3", basically just a newtype'd `Vec<u32>`.
@ -61,4 +184,26 @@ mod tests {
assert_eq!(string_repr, should_be); assert_eq!(string_repr, should_be);
} }
} }
/// A plain `# Summary` heading is picked up as the title.
#[test]
fn parse_initial_title() {
    let mut parser = SummaryParser::new("# Summary");

    let title = parser.parse_title().unwrap();

    assert_eq!(title, "Summary");
}
/// Inline styling inside the heading is stripped, leaving only the text.
#[test]
fn parse_title_with_styling() {
    let mut parser = SummaryParser::new("# My **Awesome** Summary");

    let title = parser.parse_title().unwrap();

    assert_eq!(title, "My Awesome Summary");
}
} }