diff options
Diffstat (limited to 'packages/gitbook-html/src')
-rw-r--r-- | packages/gitbook-html/src/dom.js | 62 | ||||
-rwxr-xr-x | packages/gitbook-html/src/glossary.js | 30 | ||||
-rwxr-xr-x | packages/gitbook-html/src/index.js | 50 | ||||
-rwxr-xr-x | packages/gitbook-html/src/langs.js | 17 | ||||
-rwxr-xr-x | packages/gitbook-html/src/page.js | 12 | ||||
-rwxr-xr-x | packages/gitbook-html/src/readme.js | 18 | ||||
-rwxr-xr-x | packages/gitbook-html/src/summary.js | 148 | ||||
-rw-r--r-- | packages/gitbook-html/src/totext.js | 172 |
8 files changed, 509 insertions, 0 deletions
diff --git a/packages/gitbook-html/src/dom.js b/packages/gitbook-html/src/dom.js
new file mode 100644
index 0000000..9c5e070
--- /dev/null
+++ b/packages/gitbook-html/src/dom.js
@@ -0,0 +1,68 @@
+const cheerio = require('cheerio');
+
+/**
+ * Parse an HTML string and return its content.
+ *
+ * @param {String} html
+ * @return {cheerio.DOM} the first `html`/`body` match when there is one,
+ *     otherwise the loaded cheerio instance itself
+ */
+function parse(html) {
+    const $ = cheerio.load(html);
+    const $el = $('html, body').first();
+
+    return $el.length > 0 ? $el : $;
+}
+
+/**
+ * Return main element for a DOM.
+ *
+ * @param {cheerio.DOM} $
+ * @return {cheerio.Node} the first match of `html, body, > div`, or
+ *     `$.root()` when nothing matches
+ */
+function root($) {
+    const $el = $('html, body, > div').first();
+    return $el.length > 0 ? $el : $.root();
+}
+
+/**
+ * Return text node of an element: the data of its direct text children,
+ * joined together. Text held by nested elements is not included.
+ *
+ * @param {cheerio.Node} $el
+ * @return {String} empty when the node has no children
+ */
+function textNode($el) {
+    return ($el.children || [])
+        .filter(child => child.type === 'text')
+        .map(child => child.data)
+        .join('');
+}
+
+/**
+ * Cleanup a DOM by removing all useless divs.
+ * Every div found under the element is replaced by its own inner HTML.
+ *
+ * @param {cheerio.Node} $el
+ * @param {cheerio.DOM} $
+ * @return {cheerio.Node} the `$el` that was passed in
+ */
+function cleanup($el, $) {
+    $el.find('div').each(function() {
+        const $div = $(this);
+
+        // Nested divs are unwrapped before this one is replaced
+        cleanup($div, $);
+        $div.replaceWith($div.html());
+    });
+
+    return $el;
+}
+
+module.exports = {
+    parse,
+    textNode,
+    root,
+    cleanup
+};
diff --git a/packages/gitbook-html/src/glossary.js b/packages/gitbook-html/src/glossary.js
new file mode 100755
index 0000000..a4269fe
--- /dev/null
+++ b/packages/gitbook-html/src/glossary.js
@@ -0,0 +1,29 @@
+const dom = require('./dom');
+
+/**
+ * Parse an HTML content into a list of glossary entries.
+ * Each `h2` gives an entry name; the description is the text of the first
+ * paragraph found at, or inside, the element right after the heading.
+ *
+ * @param {String} html
+ * @return {Array<{name: String, description: String}>} entries
+ */
+function parseGlossary(html) {
+    const $ = dom.parse(html);
+    const entries = [];
+
+    $('h2').each(function() {
+        const $heading = $(this);
+        const $next = $heading.next();
+        const $p = $next.is('p') ? $next.first() : $next.find('p').first();
+
+        entries.push({
+            name: $heading.text(),
+            description: $p.text()
+        });
+    });
+
+    return entries;
+}
+
+module.exports = parseGlossary;
diff --git a/packages/gitbook-html/src/index.js b/packages/gitbook-html/src/index.js
new file mode 100755
index 0000000..9d560f1
--- /dev/null
+++ b/packages/gitbook-html/src/index.js
@@ -0,0 +1,57 @@
+const ToText = require('./totext');
+
+const htmlParser = {
+    summary: require('./summary'),
+    glossary: require('./glossary'),
+    langs: require('./langs'),
+    readme: require('./readme'),
+    page: require('./page')
+};
+
+/**
+ * Compose a function with a transform function for the first argument only.
+ *
+ * @param {Function} toHTML  transform applied to the first argument
+ * @param {Function} fn      receives the transformed first argument followed
+ *     by the remaining arguments, untouched
+ * @return {Function}
+ */
+function compose(toHTML, fn) {
+    return (first, ...rest) => fn(toHTML(first), ...rest);
+}
+
+/**
+ * Create a GitBook parser from an HTML converter.
+ *
+ * @param {Object} toHTML
+ *     {Function} [toHTML.inline]
+ *     {Function} [toHTML.block]
+ * @param {Object} toText  object handed to the ToText constructor
+ * @return {Object} parser exposing summary, glossary, langs, readme, page and
+ *     inline; summary, langs and glossary also carry a `toText` function
+ */
+function createParser(toHTML, toText = {}) {
+    const parser = {
+        summary: compose(toHTML.block, htmlParser.summary),
+        glossary: compose(toHTML.block, htmlParser.glossary),
+        langs: compose(toHTML.block, htmlParser.langs),
+        readme: compose(toHTML.block, htmlParser.readme),
+        page: compose(toHTML.block, htmlParser.page),
+        inline: compose(toHTML.inline, htmlParser.page)
+    };
+
+    const _toText = new ToText(toText);
+
+    parser.summary.toText = summary => _toText.summary(summary);
+    parser.langs.toText = langs => _toText.langs(langs);
+    parser.glossary.toText = glossary => _toText.glossary(glossary);
+
+    return parser;
+}
+
+// Default export: a parser built with identity converters
+module.exports = createParser({
+    block: html => html,
+    inline: html => html
+});
+module.exports.createParser = createParser;
diff --git a/packages/gitbook-html/src/langs.js b/packages/gitbook-html/src/langs.js
new file mode 100755
index 0000000..2c3523f
--- /dev/null
+++ b/packages/gitbook-html/src/langs.js
@@ -0,0 +1,16 @@
+const parseSummary = require('./summary');
+
+/**
+ * Parse an HTML content into a list of languages.
+ * The languages are the articles of the first part of the content parsed as
+ * a summary.
+ *
+ * @param {String} content
+ * @return {Array} empty when the content holds no part
+ */
+function parseLangs(content) {
+    const { parts } = parseSummary(content);
+    return parts.length > 0 ? parts[0].articles : [];
+}
+
+module.exports = parseLangs;
diff --git a/packages/gitbook-html/src/page.js b/packages/gitbook-html/src/page.js
new file mode 100755
index 0000000..c4982b5
--- /dev/null
+++ b/packages/gitbook-html/src/page.js
@@ -0,0 +1,13 @@
+/**
+ * Parse content of a page.
+ *
+ * @param {String} html
+ * @return {Object} `{ content }` holding the HTML unchanged
+ */
+function parsePage(html) {
+    return {
+        content: html
+    };
+}
+
+module.exports = parsePage;
diff --git a/packages/gitbook-html/src/readme.js b/packages/gitbook-html/src/readme.js
new file mode 100755
index 0000000..18b0e62
--- /dev/null
+++ b/packages/gitbook-html/src/readme.js
@@ -0,0 +1,19 @@
+const dom = require('./dom');
+
+/**
+ * Parse an HTML content into metadata about a readme.
+ *
+ * @param {String} html
+ * @return {Object} `title` (text of `h1:first-child`) and `description`
+ *     (text of the first `div.paragraph` or `p`), both trimmed
+ */
+function parseReadme(html) {
+    const $ = dom.parse(html);
+
+    return {
+        title: $('h1:first-child').text().trim(),
+        description: $('div.paragraph,p').first().text().trim()
+    };
+}
+
+module.exports = parseReadme;
diff --git a/packages/gitbook-html/src/summary.js b/packages/gitbook-html/src/summary.js
new file mode 100755
index 0000000..1dda344
--- /dev/null
+++ b/packages/gitbook-html/src/summary.js
@@ -0,0 +1,159 @@
+const dom = require('./dom');
+
+const SELECTOR_LIST = 'ol, ul';
+const SELECTOR_LINK = '> a, p > a';
+const SELECTOR_PART = 'h2, h3, h4';
+
+// Tag names taken from SELECTOR_PART, for exact-match lookups
+const PART_TAGS = SELECTOR_PART.split(', ');
+
+/**
+ * Find a list.
+ * When the parent has `.olist` children, the list is looked up inside the
+ * first of them instead.
+ *
+ * @param {cheerio.Node} $parent
+ * @return {cheerio.Node} the `ol`/`ul` children that were found
+ */
+function findList($parent) {
+    const $container = $parent.children('.olist');
+    const $scope = $container.length > 0 ? $container.first() : $parent;
+
+    return $scope.children(SELECTOR_LIST);
+}
+
+/**
+ * Parse a ul list and return list of chapters recursively.
+ * Items that end up without a title are left out.
+ *
+ * @param {cheerio.Node} $ul
+ * @param {cheerio.DOM} $
+ * @return {Array<Object>} articles with `title`, `articles` and, when the
+ *     entry links somewhere, `ref`
+ */
+function parseList($ul, $) {
+    const articles = [];
+
+    $ul.children('li').each(function() {
+        const article = {};
+        const $li = $(this);
+
+        // Get text for the entry
+        const $p = $li.children('p');
+        article.title = ($p.text() || dom.textNode($li.get(0))).trim();
+
+        // Parse link
+        const $a = $li.find(SELECTOR_LINK).first();
+        if ($a.length > 0) {
+            article.title = $a.text();
+
+            // An anchor without href keeps its title but gets no ref,
+            // instead of throwing on `undefined.replace`
+            const href = $a.attr('href');
+            if (typeof href === 'string') {
+                // Backslashes become slashes, leading slashes are dropped
+                article.ref = href.replace(/\\/g, '/').replace(/^\/+/, '');
+            }
+        }
+
+        // Sub articles
+        const $sub = findList($li);
+        article.articles = parseList($sub, $);
+
+        if (!article.title) return;
+        articles.push(article);
+    });
+
+    return articles;
+}
+
+/**
+ * Find all parts and their corresponding lists.
+ * A heading opens a part and the next list closes it; a list with no
+ * pending heading becomes a part with an empty title.
+ *
+ * @param {cheerio.Node} $parent
+ * @param {cheerio.DOM} $
+ * @return {Array<{title: String, list: cheerio.Node}>} `list` is null for a
+ *     heading that no list follows
+ */
+function findParts($parent, $) {
+    // Find parts and lists
+    // TODO asciidoc compatibility
+    const partsAndLists = $parent.children(SELECTOR_LIST + ', ' + SELECTOR_PART);
+
+    // Group each part with the list after
+    const parts = [];
+    let previousPart = null;
+
+    partsAndLists.each((i, el) => {
+        if (isPartNode(el)) {
+            if (previousPart !== null) {
+                // The previous part was empty
+                parts.push(previousPart);
+            }
+            previousPart = {
+                title: getPartTitle(el, $),
+                list: null
+            };
+            return;
+        }
+
+        // It is a list: it closes the pending part, or an untitled one
+        const part = previousPart !== null ? previousPart : { title: '', list: null };
+        part.list = el;
+        parts.push(part);
+        previousPart = null;
+    });
+
+    // Last part might be empty
+    if (previousPart !== null) {
+        parts.push(previousPart);
+    }
+
+    return parts;
+}
+
+/**
+ * True if the element is a part.
+ * The tag name must equal one of the part tags; a substring of the selector
+ * string is not enough.
+ *
+ * @param el
+ * @return {Boolean}
+ */
+function isPartNode(el) {
+    return PART_TAGS.indexOf(el.name) !== -1;
+}
+
+/**
+ * Parse the title of a part element.
+ *
+ * @param el
+ * @param {cheerio.DOM} $
+ * @return {String} trimmed text of the element
+ */
+function getPartTitle(el, $) {
+    return $(el).text().trim();
+}
+
+/**
+ * Parse an HTML content into a tree of articles/parts.
+ *
+ * @param {String} html
+ * @return {Object} `{ parts }`, each part holding `title` and `articles`
+ */
+function parseSummary(html) {
+    const $ = dom.parse(html);
+    const $root = dom.cleanup(dom.root($), $);
+
+    // Parse each list
+    const parts = findParts($root, $).map(part => ({
+        title: part.title,
+        articles: parseList($(part.list), $)
+    }));
+
+    return { parts };
+}
+
+module.exports = parseSummary;
diff --git a/packages/gitbook-html/src/totext.js b/packages/gitbook-html/src/totext.js
new file mode 100644
index 0000000..6e71cd3
--- /dev/null
+++ b/packages/gitbook-html/src/totext.js
@@ -0,0 +1,227 @@
+
+/*
+    This class is extended by gitbook-markdown and gitbook-asciidoc
+    to generate back markdown/asciidoc from GitBook metadata.
+*/
+
+class ToText {
+    /**
+     * @param {Object} markup  properties copied onto the instance, so a hook
+     *     given here takes the place of the default one with the same name
+     */
+    constructor(markup) {
+        Object.assign(this, markup);
+    }
+
+    // Break line
+    onBL() {
+        return '\n';
+    }
+
+    onText(text) {
+        return text;
+    }
+
+    onHR() {
+        return '<hr />';
+    }
+
+    // ---- TITLES
+
+    onTitleStart(level) {
+        return '<h' + level + '>';
+    }
+    onTitleEnd(level) {
+        return '</h' + level + '>';
+    }
+
+    // ---- PARAGRAPHS / SECTIONS
+    onParagraphStart() {
+        return '<p>';
+    }
+    onParagraphEnd() {
+        return '</p>';
+    }
+
+    onSection() {
+        return this.onBL();
+    }
+
+    // ---- LINKS
+    onLinkStart(href) {
+        return '<a href="' + href + '">';
+    }
+    // `href` is not used by this default hook; summary rendering still
+    // passes it, so the parameter stays in the signature
+    onLinkEnd(href) {
+        return '</a>';
+    }
+
+    // ---- LISTS
+    // Items are indented one step (4 spaces) deeper than their list
+    onListItemStart(level) {
+        return this._spaces((level + 1) * 4) + '<li>';
+    }
+    onListItemEnd(level) {
+        return this._spaces((level + 1) * 4) + '</li>' + this.onBL();
+    }
+    onListStart(level) {
+        return this._spaces(level * 4) + '<ul>' + this.onBL();
+    }
+    onListEnd(level) {
+        return this._spaces(level * 4) + '</ul>' + this.onBL();
+    }
+
+    // ------ LANGS
+
+    /**
+     * Render a "Languages" title followed by the languages as a list.
+     *
+     * @param {Array} languages  entries rendered like summary articles
+     * @return {String}
+     */
+    langs(languages) {
+        let content = '';
+        content += this.onTitleStart(1) + this.onText('Languages') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        content += this._summaryArticles(languages);
+
+        return content;
+    }
+
+    // ------ GLOSSARY
+
+    /**
+     * Render a "Glossary" title, then one level-2 title and one paragraph
+     * per entry.
+     *
+     * @param {Array<{name: String, description: String}>} glossary
+     * @return {String}
+     */
+    glossary(glossary) {
+        let content = '';
+
+        content += this.onTitleStart(1) + this.onText('Glossary') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        glossary.forEach((entry) => {
+            content += this.onTitleStart(2) + this.onText(entry.name) + this.onTitleEnd(2);
+            content += this.onParagraphStart();
+            content += this.onText(entry.description);
+            content += this.onParagraphEnd();
+            content += this.onSection();
+        });
+
+        return content;
+    }
+
+    // ------ SUMMARY
+
+    /**
+     * Render one article as a list item, with its sub-articles nested one
+     * level deeper.
+     *
+     * @param {Object} article  reads `title`, `ref` and `articles`
+     * @param {Number} level
+     * @return {String}
+     */
+    _summaryArticle(article, level) {
+        let content = '';
+
+        content += this.onListItemStart(level);
+
+        if (article.ref) content += this.onLinkStart(article.ref);
+        content += this.onText(article.title);
+        if (article.ref) content += this.onLinkEnd(article.ref);
+        content += this.onBL();
+
+        if (article.articles && article.articles.length > 0) {
+            content += this._summaryArticles(article.articles, level + 1);
+        }
+
+        content += this.onListItemEnd(level);
+
+        return content;
+    }
+
+    /**
+     * Render a list of articles.
+     *
+     * @param {Array} articles
+     * @param {Number} [level]  nesting depth, 0 when omitted
+     * @return {String}
+     */
+    _summaryArticles(articles, level) {
+        const depth = level || 0;
+        let content = this.onListStart(depth);
+
+        articles.forEach((article) => {
+            content += this._summaryArticle(article, depth);
+        });
+        content += this.onListEnd(depth);
+
+        return content;
+    }
+
+    /**
+     * Render a part: its title as a level-2 title, when it has one, then its
+     * articles.
+     *
+     * @param {Object} part  reads `title` and `articles`
+     * @return {String}
+     */
+    _summaryPart(part) {
+        let content = '';
+
+        if (part.title) content += this.onTitleStart(2) + this.onText(part.title) + this.onTitleEnd(2);
+
+        content += this._summaryArticles(part.articles);
+
+        return content;
+    }
+
+    /**
+     * Render a whole summary under a "Summary" title.
+     *
+     * @param {Object} summary  reads `parts`
+     * @return {String}
+     */
+    summary(summary) {
+        let content = '';
+
+        content += this.onTitleStart(1) + this.onText('Summary') + this.onTitleEnd(1);
+        content += this.onSection();
+
+        summary.parts.forEach((part, i) => {
+            const next = summary.parts[i + 1];
+
+            content += this._summaryPart(part);
+
+            // When the next part has no title, a horizontal rule is written
+            // between the two; otherwise a section break is written
+            if (next && !next.title) {
+                content += this.onBL() + this.onHR() + this.onBL();
+            } else {
+                content += this.onSection();
+            }
+        });
+
+        return content;
+    }
+
+    // ---- Utilities
+
+    /**
+     * Repeat a string.
+     *
+     * @param {Number} n    number of repetitions
+     * @param {String} [s]  what to repeat, a single space when omitted or empty
+     * @return {String}
+     */
+    _spaces(n, s) {
+        return (s || ' ').repeat(n);
+    }
+}
+
+module.exports = ToText;