Tried to make summary parsing easier to understand and implement by writing out the EBNF grammar

2017-06-25 15:53:01 +08:00 · 2017-06-25 15:53:01 +08:00 · 588b444f06
parent eb839e4298
commit 588b444f06
2 changed files with 151 additions and 6 deletions
--- a/src/loader/mod.rs
+++ b/src/loader/mod.rs
@@ -9,7 +9,7 @@ use std::io::Read;

 mod summary;

-pub use self::summary::Summary;
+pub use self::summary::{Summary, parse_summary};


 /// The object in charge of parsing the source directory into a usable
--- a/src/loader/summary.rs
+++ b/src/loader/summary.rs
@@ -1,18 +1,141 @@
 use std::error::Error;
 use std::fmt::{self, Formatter, Display};
 use std::ops::{Deref, DerefMut};
-use pulldown_cmark;
+use pulldown_cmark::{self, Event, Tag};


+/// Parse the text from a `SUMMARY.md` file into a sort of "recipe" to be
+/// used when loading a book from disk.
+///
+/// # Summary Format
+///
+/// **Title:** It's common practice to begin with a title, generally
+/// "# Summary". It is not mandatory: if the file starts with a level-1 heading
+/// its text is stored as the title, otherwise the title is left unset.
+/// 
+/// **Prefix Chapter:** Before the main numbered chapters you can add a couple 
+/// of elements that will not be numbered. This is useful for forewords,
+/// introductions, etc. There are, however, some constraints. You cannot nest
+/// prefix chapters; they must all be at the root level. You also cannot add
+/// prefix chapters once you have added numbered chapters.
+/// 
+/// ```markdown
+/// [Title of prefix element](relative/path/to/markdown.md)
+/// ```
+/// 
+/// **Numbered Chapter:** Numbered chapters are the main content of the book, they
+/// will be numbered and can be nested, resulting in a nice hierarchy (chapters,
+/// sub-chapters, etc.)
+/// 
+/// ```markdown
+/// - [Title of the Chapter](relative/path/to/markdown.md)
+/// ```
+/// 
+/// You can use either `-` or `*` to indicate a numbered chapter.
+/// 
+/// **Suffix Chapter:** After the numbered chapters you can add a couple of
+/// non-numbered chapters. They are the same as prefix chapters but come after
+/// the numbered chapters instead of before.
+/// 
+/// All other elements are unsupported and will be ignored at best or result in
+/// an error.
+pub fn parse_summary(summary: &str) -> Result<Summary, Box<Error>> {
+    let parser = SummaryParser::new(summary);
+    parser.parse()    
+}
+
 /// The parsed `SUMMARY.md`, specifying how the book should be laid out.
+#[derive(Debug, Clone, Default, PartialEq)]
 pub struct Summary {
    title: Option<String>,
 }

-/// Parse the text from a `SUMMARY.md` file into a sort of "recipe" to be
-/// used when loading a book from disk.
-pub fn parse_summary(summary: &str) -> Result<Summary, Box<Error>> {
-    unimplemented!()
+/// A stateful parser for parsing a `SUMMARY.md` file.
+///
+/// # Grammar
+/// 
+/// The `SUMMARY.md` file has a grammar which looks something like this:
+///
+/// ```text
+/// summary           ::= title prefix_chapters numbered_chapters suffix_chapters
+/// title             ::= "# " TEXT
+///                     | EPSILON
+/// prefix_chapters   ::= item*
+/// suffix_chapters   ::= item*
+/// numbered_chapters ::= dotted_item+
+/// dotted_item       ::= INDENT* DOT_POINT item
+/// item              ::= link 
+///                     | separator
+/// separator         ::= "---"
+/// link              ::= "[" TEXT "]" "(" TEXT ")"
+/// DOT_POINT         ::= "-"
+///                     | "*"
+/// ```
+/// 
+/// > **Note:** the `TEXT` terminal is "normal" text, and should (roughly) 
+/// > match the following regex: "[^<>\n\[\]]+".
+struct SummaryParser<'a> {
+    stream: pulldown_cmark::Parser<'a>,
+    summary: Summary,
+}
+
+impl<'a> SummaryParser<'a> 
+{
+    fn new(text: &str) -> SummaryParser {
+        let pulldown_parser = pulldown_cmark::Parser::new(text);
+        let intermediate_summary = Summary::default();
+
+        SummaryParser {
+            stream: pulldown_parser,
+            summary: intermediate_summary,
+        }
+    }
+
+    fn parse(mut self) -> Result<Summary, Box<Error>> {
+        self.summary.title = self.parse_title();
+
+        Ok(self.summary)        
+    }
+
+    fn parse_title(&mut self) -> Option<String> {
+        if let Some(Event::Start(Tag::Header(1))) = self.stream.next() {
+            debug!("[*] Found a h1 in the SUMMARY");
+            
+            let mut tags = Vec::new();
+
+            loop {
+                let next_event = self.stream.next();
+                match next_event {
+                    Some(Event::End(Tag::Header(1))) => break,
+                    Some(other) => tags.push(other),
+                    None => {
+                        // If we ever get here then chances are pulldown_cmark
+                        // is seriously broken. It means there's an opening 
+                        // <h1> tag but not a closing one. It also means 
+                        // we've consumed the entire stream of events, so
+                        // chances are any parsing after this will just hit
+                        // EOF and end early :(
+                        warn!("[*] No closing <h1> tag in the SUMMARY.md file");
+                        break;
+                    }
+                }
+            }
+
+            // TODO: How do we deal with headings like "# My **awesome** summary"?
+            // for now, I'm just going to scan through and concatenate the 
+            // Event::Text tags, skipping any styling.
+            let title: String = tags.into_iter()
+                .filter_map(|t| match t {
+                    Event::Text(text) => Some(text),
+                    _ => None,
+                })
+                .collect();
+
+            Some(title)
+        } else {
+            None
+        }
+    }
 }

 /// A section number like "1.2.3", basically just a newtype'd `Vec<u32>`.
@@ -61,4 +184,26 @@ mod tests {
            assert_eq!(string_repr, should_be);
        }
    }
+
+    #[test]
+    fn parse_initial_title() {
+        let src = "# Summary";
+        let should_be = String::from("Summary");
+
+        let mut parser = SummaryParser::new(src);
+        let got = parser.parse_title().unwrap();
+
+        assert_eq!(got, should_be);
+    }
+
+    #[test]
+    fn parse_title_with_styling() {
+        let src = "# My **Awesome** Summary";
+        let should_be = String::from("My Awesome Summary");
+
+        let mut parser = SummaryParser::new(src);
+        let got = parser.parse_title().unwrap();
+
+        assert_eq!(got, should_be);
+    }
 }