diff options
author | Samy Pessé <samypesse@gmail.com> | 2016-02-19 16:12:43 +0100 |
---|---|---|
committer | Samy Pessé <samypesse@gmail.com> | 2016-12-22 12:32:14 +0100 |
commit | 4c44d677117d926b6dcc164f55fe34079c2ca3c7 (patch) | |
tree | 71d0fca56421f3a7b68d60a446f89d3735828f03 /packages/gitbook-html | |
parent | 9e99b5850fd866fc2f9196993a0ae7e342311558 (diff) | |
download | gitbook-4c44d677117d926b6dcc164f55fe34079c2ca3c7.zip gitbook-4c44d677117d926b6dcc164f55fe34079c2ca3c7.tar.gz gitbook-4c44d677117d926b6dcc164f55fe34079c2ca3c7.tar.bz2 |
Improve summary parser
Diffstat (limited to 'packages/gitbook-html')
-rw-r--r-- | packages/gitbook-html/lib/dom.js | 11 | ||||
-rwxr-xr-x | packages/gitbook-html/lib/glossary.js | 5 | ||||
-rwxr-xr-x | packages/gitbook-html/lib/index.js | 27 | ||||
-rwxr-xr-x | packages/gitbook-html/lib/langs.js | 4 | ||||
-rwxr-xr-x | packages/gitbook-html/lib/page.js | 6 | ||||
-rwxr-xr-x | packages/gitbook-html/lib/summary.js | 97 | ||||
-rw-r--r-- | packages/gitbook-html/package.json | 4 | ||||
-rwxr-xr-x | packages/gitbook-html/test/fixtures/SUMMARY.html | 14 | ||||
-rwxr-xr-x | packages/gitbook-html/test/glossary.js | 10 | ||||
-rwxr-xr-x | packages/gitbook-html/test/langs.js | 10 | ||||
-rwxr-xr-x | packages/gitbook-html/test/readme.js | 10 | ||||
-rwxr-xr-x | packages/gitbook-html/test/summary.js | 50 |
12 files changed, 168 insertions, 80 deletions
diff --git a/packages/gitbook-html/lib/dom.js b/packages/gitbook-html/lib/dom.js index 2c2eaf7..df4de90 100644 --- a/packages/gitbook-html/lib/dom.js +++ b/packages/gitbook-html/lib/dom.js @@ -3,12 +3,18 @@ var cheerio = require('cheerio'); // Parse an HTML string and return its content function parse(html) { - var $ = cheerio.load('<div>'+html+'</div>'); + var $ = cheerio.load(html); var $el = $('html, body').first(); return $el.length > 0? $el : $; } +// Return main element +function root($) { + var $el = $('html, body, > div').first(); + return $el.length > 0? $el : $.root(); +} + // Return text node of an element function textNode($el) { return _.reduce($el.children, function(text, e) { @@ -19,5 +25,6 @@ function textNode($el) { module.exports = { parse: parse, - textNode: textNode + textNode: textNode, + root: root }; diff --git a/packages/gitbook-html/lib/glossary.js b/packages/gitbook-html/lib/glossary.js index 648ba1a..9d3799b 100755 --- a/packages/gitbook-html/lib/glossary.js +++ b/packages/gitbook-html/lib/glossary.js @@ -7,9 +7,10 @@ function parseGlossary(html) { var entries = []; - $("h2").each(function() { + $('h2').each(function() { var $heading = $(this); - var $p = $heading.next(); + var $next = $heading.next() + var $p = $next.is('p')? $next.first() : $next.find('p').first(); var entry = {}; diff --git a/packages/gitbook-html/lib/index.js b/packages/gitbook-html/lib/index.js index a7c478c..0e67c94 100755 --- a/packages/gitbook-html/lib/index.js +++ b/packages/gitbook-html/lib/index.js @@ -1,8 +1,33 @@ +var _ = require('lodash'); -module.exports = { +var htmlParser = { summary: require("./summary"), glossary: require("./glossary"), langs: require("./langs"), readme: require("./readme"), page: require("./page") }; + +// Compose a function with a transform function for the first args +function compose(toHTML, fn) { + return function() { + var args = _.toArray(arguments); + args[0] = toHTML(args[0]); + + return fn.apply(null, args); + } +} + +// Create a GitBook parser +function createParser(toHTML) { + return { + summary: compose(toHTML, htmlParser.summary), + glossary: compose(toHTML, htmlParser.glossary), + langs: compose(toHTML, htmlParser.langs), + readme: compose(toHTML, htmlParser.readme), + page: compose(toHTML, htmlParser.page) + } +} + +module.exports = htmlParser; +module.exports.createParser = createParser; diff --git a/packages/gitbook-html/lib/langs.js b/packages/gitbook-html/lib/langs.js index 1042dcb..270a9f6 100755 --- a/packages/gitbook-html/lib/langs.js +++ b/packages/gitbook-html/lib/langs.js @@ -1,9 +1,9 @@ var _ = require('lodash'); -var parseEntries = require('./summary').entries; +var parseSummary = require('./summary'); // HTML -> Languages function parseLangs(content) { - return parseEntries(content); + return parseSummary(content).parts[0].articles; } // Languages -> HTML diff --git a/packages/gitbook-html/lib/page.js b/packages/gitbook-html/lib/page.js index e687050..6f056fc 100755 --- a/packages/gitbook-html/lib/page.js +++ b/packages/gitbook-html/lib/page.js @@ -2,8 +2,10 @@ var Q = require('q'); var _ = require('lodash'); // HTML -> HTML -function parsePage(src) { - return src; +function parsePage(html) { + return { + content: html + }; } module.exports = parsePage; diff --git a/packages/gitbook-html/lib/summary.js b/packages/gitbook-html/lib/summary.js index 1e2d63d..e71d6b5 100755 --- a/packages/gitbook-html/lib/summary.js +++ b/packages/gitbook-html/lib/summary.js @@ -1,14 +1,17 @@ var _ = require('lodash'); var dom = require('./dom'); +var SELECTOR_LIST = '.olist > ol, ol, ul'; +var SELECTOR_LINK = 'a, p > a'; + +var BL = '\n'; // parse a ul list and return list of chapters recursvely function parseList($ul, $) { var articles = []; - $ul.children('>li').each(function() { + $ul.children('li').each(function() { var article = {}; - var $li = $(this); // Get text for the entry @@ -16,14 +19,14 @@ function parseList($ul, $) { article.title = $p.text() || dom.textNode($li.get(0)); // Parse link - var $a = $li.find('> a, > p > a'); + var $a = $li.children(SELECTOR_LINK); if ($a.length > 0) { article.title = $a.first().text(); article.path = $a.attr('href').replace(/\\/g, '/').replace(/^\/+/, '') } // Sub articles - var $sub = $li.children('> .olist > ol, > ol, > ul'); + var $sub = $li.children(SELECTOR_LIST).first(); article.articles = parseList($sub, $); articles.push(article); @@ -32,55 +35,73 @@ function parseList($ul, $) { return articles; } -// Return a list of entries in a div -function parseEntries (html) { +// HTML -> Summary +function parseSummary(html) { var $ = dom.parse(html); - var chapters = parseList($("> ol, > ul").first(), $); - return chapters; -} + var $root = dom.root($); -// HTML -> Summary -function parseSummary(src) { - var chapters = parseEntries(src); + var $lists = $root.children(SELECTOR_LIST); + var parts = []; + + $lists.each(function() { + var $list = $(this); + + parts.push({ + articles: parseList($(SELECTOR_LIST).first(), $) + }); + }); return { - chapters: chapters + parts: parts }; } // Summary -> HTML -function summaryToText(summary) { - var bl = '\n'; - - var _base = function(article) { - if (article.path) { - return '<a href="'+article.path+'">'+article.title+'</a>'; - } else { - return article.title; - } - }; +function textPrefix(d) { + return Array(d*4).join(' '); +} - var convertArticle = function(article, d) { - var content = Array(d+2).join(' ') + '<li>' + _base(article); +function articleToText(article, d) { + var prefix = textPrefix(d); + var content = prefix + '<li>'; - if (article.articles.length > 0) { - content += convertArticles(article.articles, d); - } - return content + '</li>' + bl; - }; + if (article.path) { + content += '<a href="'+article.path+'">'+article.title+'</a>'; + } else { + content += article.title; + } - var convertArticles = function(articles, d) { - var content = '<ul>' + bl; - _.each(articles, function(_article) { - content += convertArticle(_article, d + 1); - }); - return content + '<ul>' + bl; + if (article.articles.length > 0) { + content += BL + articlesToText(article.articles, d) + prefix; } + content += '</li>' + BL; + + return content; +} + +function articlesToText(articles, d) { + var prefix = textPrefix(d); + var content = prefix + '<ul>' + BL; + _.each(articles, function(_article) { + content += articleToText(_article, d + 1); + }); + return content + '</ul>' + BL; +} + +function partsToText(part) { + return articlesToText(part.articles, 0) + BL + BL; +} + +function summaryToText(summary) { + var content = '<h1>Summary</h1>' + BL; + + _.each(summary.parts, function(part) { + content += partsToText(part); + }); - return '<h1>Summary</h1>'+ bl+bl + convertArticles(summary.chapters, 0) + bl; + return content + BL; }; module.exports = parseSummary; -module.exports.entries = parseEntries; module.exports.toText = summaryToText; diff --git a/packages/gitbook-html/package.json b/packages/gitbook-html/package.json index ff28ca6..a21259b 100644 --- a/packages/gitbook-html/package.json +++ b/packages/gitbook-html/package.json @@ -7,13 +7,13 @@ "dependencies": { "q": "^1.1.2", "lodash": "^3.2.0", - "cheerio": "^0.19.0" + "cheerio": "^0.20.0 && >=0.20.0" }, "devDependencies": { "mocha": "^2.3.2" }, "scripts": { - "test": "export TESTING=true; mocha --reporter list --bail" + "test": "export TESTING=true; mocha --reporter spec --bail" }, "repository": { "type": "git", diff --git a/packages/gitbook-html/test/fixtures/SUMMARY.html b/packages/gitbook-html/test/fixtures/SUMMARY.html index f469249..bae97f3 100755 --- a/packages/gitbook-html/test/fixtures/SUMMARY.html +++ b/packages/gitbook-html/test/fixtures/SUMMARY.html @@ -23,4 +23,16 @@ </ul> </li> <li>Unfinished Chapter</li> -</ul>
\ No newline at end of file +</ul> + +<ul> + <li> + <a href="chapter-1/README.md">Chapter 1</a> + </li> +</ul> + +<ul> + <li> + <a href="chapter-1/README.md">Chapter 1</a> + </li> +</ul> diff --git a/packages/gitbook-html/test/glossary.js b/packages/gitbook-html/test/glossary.js index 250c6f1..8bd77d6 100755 --- a/packages/gitbook-html/test/glossary.js +++ b/packages/gitbook-html/test/glossary.js @@ -4,10 +4,14 @@ var assert = require('assert'); var glossary = require('../').glossary; -var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/GLOSSARY.html'), 'utf8'); -var LEXED = glossary(CONTENT); - describe('Glossary parsing', function () { + var LEXED; + + before(function() { + var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/GLOSSARY.html'), 'utf8'); + LEXED = glossary(CONTENT); + }); + it('should only get heading + paragraph pairs', function() { assert.equal(LEXED.length, 5); }); diff --git a/packages/gitbook-html/test/langs.js b/packages/gitbook-html/test/langs.js index c51cf2d..6b5e00b 100755 --- a/packages/gitbook-html/test/langs.js +++ b/packages/gitbook-html/test/langs.js @@ -4,10 +4,14 @@ var assert = require('assert'); var langs = require('../').langs; -var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/LANGS.html'), 'utf8'); -var LEXED = langs(CONTENT); - describe('Languages parsing', function () { + var LEXED; + + before(function() { + var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/LANGS.html'), 'utf8'); + LEXED = langs(CONTENT); + }); + it('should detect paths and titles', function() { assert.equal(LEXED[0].path,'en/'); assert.equal(LEXED[0].title,'English'); diff --git a/packages/gitbook-html/test/readme.js b/packages/gitbook-html/test/readme.js index 9d9ca29..f38f40b 100755 --- a/packages/gitbook-html/test/readme.js +++ b/packages/gitbook-html/test/readme.js @@ -4,11 +4,13 @@ var assert = require('assert'); var readme = require('../').readme; - -var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/README.html'), 'utf8'); -var LEXED = readme(CONTENT); - describe('Readme parsing', function () { + var LEXED; + + before(function() { + var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/README.html'), 'utf8'); + LEXED = readme(CONTENT); + }); it('should contain a title', function() { assert(LEXED.title); diff --git a/packages/gitbook-html/test/summary.js b/packages/gitbook-html/test/summary.js index 8d686fc..4d06c32 100755 --- a/packages/gitbook-html/test/summary.js +++ b/packages/gitbook-html/test/summary.js @@ -4,37 +4,47 @@ var assert = require('assert'); var summary = require('../').summary; -var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/SUMMARY.html'), 'utf8'); -var LEXED = summary(CONTENT); describe('Summary parsing', function () { + var LEXED, PART; + + before(function() { + var CONTENT = fs.readFileSync(path.join(__dirname, './fixtures/SUMMARY.html'), 'utf8'); + LEXED = summary(CONTENT); + PART = LEXED.parts[0]; + }); + + it('should detect parts', function() { + assert.equal(LEXED.parts.length, 3); + }); + it('should detect chapters', function() { - assert.equal(LEXED.chapters.length, 5); + assert.equal(PART.articles.length, 5); }); it('should support articles', function() { - assert.equal(LEXED.chapters[0].articles.length, 2); - assert.equal(LEXED.chapters[1].articles.length, 0); - assert.equal(LEXED.chapters[2].articles.length, 0); + assert.equal(PART.articles[0].articles.length, 2); + assert.equal(PART.articles[1].articles.length, 0); + assert.equal(PART.articles[2].articles.length, 0); }); it('should detect paths and titles', function() { - assert(LEXED.chapters[0].path); - assert(LEXED.chapters[1].path); - assert(LEXED.chapters[2].path); - assert(LEXED.chapters[3].path); - assert.equal(LEXED.chapters[4].path, null); - - assert(LEXED.chapters[0].title); - assert(LEXED.chapters[1].title); - assert(LEXED.chapters[2].title); - assert(LEXED.chapters[3].title); - assert(LEXED.chapters[4].title); + assert(PART.articles[0].path); + assert(PART.articles[1].path); + assert(PART.articles[2].path); + assert(PART.articles[3].path); + assert.equal(PART.articles[4].path, null); + + assert(PART.articles[0].title); + assert(PART.articles[1].title); + assert(PART.articles[2].title); + assert(PART.articles[3].title); + assert(PART.articles[4].title); }); it('should normalize paths from .md', function() { - assert.equal(LEXED.chapters[0].path,'chapter-1/README.md'); - assert.equal(LEXED.chapters[1].path,'chapter-2/README.md'); - assert.equal(LEXED.chapters[2].path,'chapter-3/README.md'); + assert.equal(PART.articles[0].path,'chapter-1/README.md'); + assert.equal(PART.articles[1].path,'chapter-2/README.md'); + assert.equal(PART.articles[2].path,'chapter-3/README.md'); }); it('should correctly convert it to text', function() { |