8 files changed, 509 insertions, 0 deletions
diff --git a/packages/gitbook-html/src/dom.js b/packages/gitbook-html/src/dom.js
new file mode 100644
index 0000000..9c5e070
--- /dev/null
+++ b/packages/gitbook-html/src/dom.js
@@ -0,0 +1,62 @@
+const cheerio = require('cheerio');
+
+/**
+ * Load an HTML string with cheerio and hand back its content.
+ * @param {String} html
+ * @return {cheerio.DOM} first html/body match, or the loaded document
+ */
+function parse(html) {
+    const doc = cheerio.load(html);
+    const $wrapper = doc('html, body').first();
+    if ($wrapper.length > 0) return $wrapper;
+    return doc;
+}
+
+/**
+ * Pick the main element of a DOM, falling back to the document root.
+ * @param {cheerio.DOM} $
+ * @return {cheerio.Node} first match of 'html, body, > div', else $.root()
+ */
+function root($) {
+    const $main = $('html, body, > div').first();
+    return $main.length === 0 ? $.root() : $main;
+}
+
+/**
+ * Concatenate the direct text children of a raw DOM node.
+ * Text nested inside child elements is not included.
+ * @param {Object} $el raw node exposing a `children` array
+ * @return {String}
+ */
+function textNode($el) {
+    // A node without a `children` array simply has no text of its own.
+    return ($el.children || []).reduce(
+        // Strict comparison: only nodes typed exactly 'text' contribute.
+        (text, e) => (e.type === 'text' ? text + e.data : text),
+        ''
+    );
+}
+
+/**
+ * Cleanup a DOM by replacing every descendant div with its inner HTML.
+ * @param {cheerio.Node} $el element whose descendant divs are unwrapped
+ * @param {cheerio.DOM} $ function used to wrap each matched div
+ * @return {cheerio.Node} the same $el that was passed in
+ */
+function cleanup($el, $) {
+    $el.find('div').each(function() {
+        const $div = $(this);
+        cleanup($div, $); // unwrap the divs nested in this one first
+
+        $div.replaceWith($div.html()); // the div's content takes its place
+    });
+
+    return $el;
+}
+
+module.exports = {
+    parse,
+    textNode,
+    root,
+    cleanup
+};
diff --git a/packages/gitbook-html/src/glossary.js b/packages/gitbook-html/src/glossary.js
new file mode 100755
index 0000000..a4269fe
--- /dev/null
+++ b/packages/gitbook-html/src/glossary.js
@@ -0,0 +1,30 @@
+const dom = require('./dom');
+
+/**
+ * Build the glossary entries described by an HTML document.
+ * Every h2 names an entry; the description is read from the element
+ * that follows it.
+ *
+ * @param {String} html
+ * @return {Array} entries as {name, description} objects
+ */
+function parseGlossary(html) {
+    const $ = dom.parse(html);
+    const entries = [];
+
+    $('h2').each((index, heading) => {
+        const $title = $(heading);
+        const $sibling = $title.next();
+        // The description is the next element itself, or its first nested p.
+        const $description = ($sibling.is('p') ? $sibling : $sibling.find('p')).first();
+
+        entries.push({
+            name: $title.text(),
+            description: $description.text()
+        });
+    });
+
+    return entries;
+}
+
+module.exports = parseGlossary;
diff --git a/packages/gitbook-html/src/index.js b/packages/gitbook-html/src/index.js
new file mode 100755
index 0000000..9d560f1
--- /dev/null
+++ b/packages/gitbook-html/src/index.js
@@ -0,0 +1,50 @@
+const ToText = require('./totext');
+
+const htmlParser = {
+    summary: require('./summary'),
+    glossary: require('./glossary'),
+    langs: require('./langs'),
+    readme: require('./readme'),
+    page: require('./page')
+};
+
+// Wrap fn so that its first argument goes through toHTML before the call
+function compose(toHTML, fn) {
+    return (first, ...rest) => {
+        const converted = toHTML(first);
+        return fn(converted, ...rest);
+    };
+}
+
+/**
+ * Build a GitBook parser on top of an HTML converter.
+ * @param  {Object} toHTML
+ *         {Function} toHTML.inline applied to the input of parser.inline
+ *         {Function} toHTML.block applied to the input of every other parser
+ * @param  {Object} toText properties copied onto the ToText generator
+ * @return {Object} parser functions; summary/langs/glossary carry a toText
+ */
+function createParser(toHTML, toText = {}) {
+    const { block, inline } = toHTML;
+    const generator = new ToText(toText);
+
+    const parser = {
+        summary: compose(block, htmlParser.summary),
+        glossary: compose(block, htmlParser.glossary),
+        langs: compose(block, htmlParser.langs),
+        readme: compose(block, htmlParser.readme),
+        page: compose(block, htmlParser.page),
+        inline: compose(inline, htmlParser.page)
+    };
+
+    parser.summary.toText = value => generator.summary(value);
+    parser.langs.toText = value => generator.langs(value);
+    parser.glossary.toText = value => generator.glossary(value);
+    return parser;
+}
+
+module.exports = createParser({ // default parser: the input is already HTML
+    block:  html => html,
+    inline: html => html
+});
+module.exports.createParser = createParser; // build a parser around another toHTML
diff --git a/packages/gitbook-html/src/langs.js b/packages/gitbook-html/src/langs.js
new file mode 100755
index 0000000..2c3523f
--- /dev/null
+++ b/packages/gitbook-html/src/langs.js
@@ -0,0 +1,17 @@
+const parseSummary = require('./summary');
+
+/**
+ * Extract the list of languages from an HTML content.
+ * The content is parsed as a summary; the languages are the articles
+ * of its first part.
+ * @param {String} content HTML to parse
+ * @return {Array} articles of the first part, [] when there is no part
+ */
+function parseLangs(content) {
+    const { parts } = parseSummary(content);
+
+    // Only the first part is read; any following part is ignored
+    return parts.length === 0 ? [] : parts[0].articles;
+}
+
+module.exports = parseLangs;
diff --git a/packages/gitbook-html/src/page.js b/packages/gitbook-html/src/page.js
new file mode 100755
index 0000000..c4982b5
--- /dev/null
+++ b/packages/gitbook-html/src/page.js
@@ -0,0 +1,12 @@
+/**
+ * Wrap the HTML of a page into a page object.
+ * The HTML is passed through as-is.
+ * @param {String} html
+ * @return {Object} page as {content}
+ */
+function parsePage(html) {
+    const page = { content: html };
+    return page;
+}
+
+module.exports = parsePage;
diff --git a/packages/gitbook-html/src/readme.js b/packages/gitbook-html/src/readme.js
new file mode 100755
index 0000000..18b0e62
--- /dev/null
+++ b/packages/gitbook-html/src/readme.js
@@ -0,0 +1,18 @@
+const dom = require('./dom');
+
+/**
+ * Read the metadata of a readme out of its HTML content.
+ *
+ * @param {String} html
+ * @return {Object} readme as {title, description}, both trimmed
+ */
+function parseReadme(html) {
+    const $ = dom.parse(html);
+    const $title = $('h1:first-child');
+    const $intro = $('div.paragraph,p').first();
+
+    const title = $title.text().trim();
+    return { title, description: $intro.text().trim() };
+}
+
+module.exports = parseReadme;
diff --git a/packages/gitbook-html/src/summary.js b/packages/gitbook-html/src/summary.js
new file mode 100755
index 0000000..1dda344
--- /dev/null
+++ b/packages/gitbook-html/src/summary.js
@@ -0,0 +1,148 @@
+const dom = require('./dom');
+
+const SELECTOR_LIST = 'ol, ul';
+const SELECTOR_LINK = '> a, p > a';
+const SELECTOR_PART = 'h2, h3, h4';
+
+/**
+ * Find the lists directly under an element, or under its first
+ * '.olist' child when it has one.
+ * @param {cheerio.Node} $parent
+ * @return {cheerio.Node} matching list elements
+ */
+function findList($parent) {
+    const $wrappers = $parent.children('.olist');
+    const $scope = $wrappers.length > 0 ? $wrappers.first() : $parent;
+    return $scope.children(SELECTOR_LIST);
+}
+
+/**
+ * Parse a ul list and return list of chapters recursively.
+ * @param {cheerio.Node} $ul list whose direct li children are the chapters
+ * @param {cheerio.DOM} $
+ * @return {Array} articles as {title, [ref], articles}; untitled ones are dropped
+ */
+function parseList($ul, $) {
+    const articles = [];
+
+    $ul.children('li').each(function() {
+        const article = {};
+        const $li = $(this);
+
+        // Get text for the entry
+        const $p = $li.children('p');
+        article.title = ($p.text() || dom.textNode($li.get(0))).trim();
+
+        // Parse link; an anchor without href must not throw, it yields ''
+        const $a = $li.find(SELECTOR_LINK).first();
+        if ($a.length > 0) {
+            article.title = $a.text();
+            article.ref = ($a.attr('href') || '').replace(/\\/g, '/').replace(/^\/+/, '');
+        }
+
+        // Sub articles
+        const $sub = findList($li);
+        article.articles = parseList($sub, $);
+
+        if (!article.title) return;
+        articles.push(article);
+    });
+
+    return articles;
+}
+
+/**
+ * Find all parts and pair each one with the list that follows it.
+ * @param {cheerio.Node} $parent element whose direct children are scanned
+ * @param {cheerio.DOM} $ passed on to getPartTitle
+ * @return {Array<{title: String, list: cheerio.Node}>} list is null for a part without list
+ */
+function findParts($parent, $) {
+    // Find part titles and lists among the direct children
+    // TODO asciidoc compatibility
+    const partsAndLists = $parent.children(SELECTOR_LIST + ', ' + SELECTOR_PART);
+
+    // Group each part with the list after
+    const parts = [];
+    let previousPart = null; // part whose title was seen but not yet its list
+
+    partsAndLists.each((i, el) => {
+        if (isPartNode(el)) {
+            if (previousPart !== null) {
+                // The previous part had no list: keep it with list === null
+                parts.push(previousPart);
+            }
+            previousPart = {
+                title: getPartTitle(el, $),
+                list: null
+            };
+
+        } else { // It is a list
+            if (previousPart !== null) {
+                previousPart.list = el;
+            } else {
+                previousPart = { // list without a title before it
+                    title: '',
+                    list: el
+                };
+            }
+            parts.push(previousPart);
+            previousPart = null; // a following list starts a new untitled part
+        }
+    });
+
+    // The last part might have a title but no list
+    if (previousPart !== null) {
+        parts.push(previousPart);
+    }
+
+    return parts;
+}
+
+/**
+ * True if the element's tag is exactly one of the SELECTOR_PART tags.
+ * @param el raw DOM node exposing its tag as `name`
+ * @return {Boolean} false for partial matches such as 'h'
+ */
+function isPartNode(el) {
+    return SELECTOR_PART.split(/\s*,\s*/).indexOf(el.name) !== -1;
+}
+
+/**
+ * Read the title of a part: the trimmed text content of its element.
+ * @param el element of the part
+ * @param {cheerio.DOM} $ used to wrap el
+ * @return {String}
+ */
+function getPartTitle(el, $) {
+    return $(el).text().trim();
+}
+
+/**
+ * Turn an HTML content into a summary: a list of parts, each one
+ * holding its tree of articles.
+ * @param {String} html
+ * @return {Object} summary as {parts: [{title, articles}]}
+ */
+function parseSummary(html) {
+    const $ = dom.parse(html);
+
+    // Unwrap the divs of the main element before looking for parts
+    const $main = dom.root($);
+    const $root = dom.cleanup($main, $);
+
+    // Each part is a title paired with its (possibly missing) list
+    const rawParts = findParts($root, $);
+
+    // Parse the list of every part into articles
+    const parts = rawParts.map(({ title, list }) => {
+        const $list = $(list);
+        const articles = parseList($list, $);
+
+        return { title, articles };
+    });
+
+    return { parts };
+}
+
+module.exports = parseSummary;
diff --git a/packages/gitbook-html/src/totext.js b/packages/gitbook-html/src/totext.js
new file mode 100644
index 0000000..6e71cd3
--- /dev/null
+++ b/packages/gitbook-html/src/totext.js
@@ -0,0 +1,172 @@
+
+/*
+    Base text generator, emitting HTML by default. It is extended by gitbook-markdown
+    and gitbook-asciidoc to generate back markdown/asciidoc from GitBook metadata.
+*/
+
+class ToText {
+    constructor(markup) {
+        Object.assign(this, markup); // markup properties are copied onto the instance
+    }
+
+    // Line break; also used by onSection and the list hooks
+    onBL() {
+        return '\n';
+    }
+
+    onText(text) {
+        return text; // returned as-is, no escaping
+    }
+
+    onHR() {
+        return '<hr />';
+    }
+
+    // ---- TITLES
+
+    onTitleStart(level) {
+        return '<h' + level + '>';
+    }
+    onTitleEnd(level) {
+        return '</h' + level + '>';
+    }
+
+    // ---- PARAGRAPHS / SECTIONS
+    onParagraphStart() {
+        return '<p>';
+    }
+    onParagraphEnd() {
+        return '</p>';
+    }
+
+    // Separator emitted after a section; a line break by default
+    onSection() {
+        return this.onBL();
+    }
+
+    // ---- LINKS
+    onLinkStart(href) {
+        return '<a href="' + href + '">'; // href is inserted as-is
+    }
+    onLinkEnd(href) {
+        return '</a>'; // href is not used by this implementation
+    }
+
+    // ---- LISTS (items are indented 4 spaces deeper than their list)
+    onListItemStart(level) {
+        return this._spaces((level + 1) * 4) + '<li>';
+    }
+    onListItemEnd(level) {
+        return this._spaces((level + 1) * 4) + '</li>' + this.onBL();
+    }
+    onListStart(level) {
+        return this._spaces(level * 4) + '<ul>' + this.onBL();
+    }
+    onListEnd(level) {
+        return this._spaces(level * 4) + '</ul>' + this.onBL();
+    }
+
+    // ------ LANGS
+
+    langs(languages) {
+        let content = '';
+        content += this.onTitleStart(1) + this.onText('Languages') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        content += this._summaryArticles(languages); // rendered like summary articles
+
+        return content;
+    }
+
+    // ------ GLOSSARY
+
+    glossary(glossary) {
+        let content = '';
+
+        content += this.onTitleStart(1) + this.onText('Glossary') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        glossary.forEach((entry) => { // one level-2 title and one paragraph per entry
+            content += this.onTitleStart(2) + this.onText(entry.name) + this.onTitleEnd(2);
+            content += this.onParagraphStart();
+            content += this.onText(entry.description);
+            content += this.onParagraphEnd();
+            content += this.onSection();
+        });
+
+        return content;
+    }
+
+    // ------ SUMMARY
+
+    _summaryArticle(article, level) {
+        let content = '';
+
+        content += this.onListItemStart(level);
+
+        if (article.ref) content += this.onLinkStart(article.ref); // link only when ref is set
+        content += this.onText(article.title);
+        if (article.ref) content += this.onLinkEnd(article.ref);
+        content += this.onBL();
+
+        if (article.articles && article.articles.length > 0) {
+            content += this._summaryArticles(article.articles, level + 1); // nested list
+        }
+
+        content += this.onListItemEnd(level);
+
+        return content;
+    }
+    _summaryArticles(articles, level) {
+        let content = '';
+
+        level = level || 0; // callers rendering a top-level list omit level
+
+        content += this.onListStart(level);
+        articles.forEach((article) => {
+            content += this._summaryArticle(article, level);
+        });
+        content += this.onListEnd(level);
+
+        return content;
+    }
+    _summaryPart(part) {
+        let content = '';
+
+        if (part.title) content += this.onTitleStart(2) + this.onText(part.title) + this.onTitleEnd(2);
+
+        content += this._summaryArticles(part.articles);
+
+        return content;
+    }
+
+    summary(summary) {
+        let content = '';
+
+        content += this.onTitleStart(1) + this.onText('Summary') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        summary.parts.forEach((part, i) => {
+            const next = summary.parts[i + 1];
+
+            content += this._summaryPart(part);
+
+            if (next && !next.title) { // an untitled next part is set apart by a rule
+                content += this.onBL() + this.onHR() + this.onBL();
+            } else {
+                content += this.onSection();
+            }
+
+        });
+
+        return content;
+    }
+
+    // ---- Utilities
+
+    _spaces(n, s) {
+        return Array(n + 1).join(s || ' '); // n copies of s, a space by default
+    }
+}
+
+module.exports = ToText;