summaryrefslogtreecommitdiffstats
path: root/lib/parsers/html.js
diff options
context:
space:
mode:
Diffstat (limited to 'lib/parsers/html.js')
-rw-r--r--lib/parsers/html.js90
1 files changed, 90 insertions, 0 deletions
diff --git a/lib/parsers/html.js b/lib/parsers/html.js
new file mode 100644
index 0000000..8f4ed34
--- /dev/null
+++ b/lib/parsers/html.js
@@ -0,0 +1,90 @@
+var _ = require('lodash');
+var cheerio = require('cheerio');
+
+// Parse summary and returns a list of sections
+function parseSummary(html) {
+ var sections = [];
+ var $ = cheerio.load(html);
+
+ // Find main container
+ var $body = getContainer($);
+
+ // Extract sections, and parse
+ var $lists = $body.find('> ul, > ol');
+
+ $lists.each(function() {
+ sections.push({
+ articles: parseList($(this), $)
+ });
+ });
+
+ return sections;
+}
+
+// Parse readme and extract title, description
+function parseReadme(html) {
+ var $ = cheerio.load(html);
+
+ // Find main container
+ var $body = getContainer($);
+
+ return {
+ title: $body.find('h1:first-child').text().trim(),
+ description: $body.find('div.paragraph').first().text().trim()
+ };
+}
+
+// Return a page container (html, body tag or directly the root element)
+function getContainer($) {
+ var $body = $('body, html').first();
+ if (!$body) $body = $;
+
+ return $body;
+}
+
+// Parse a ul list and return list of chapters recursvely
+function parseList($ul, $) {
+ var articles = [];
+
+ $ul.children('li').each(function() {
+ var article = {};
+
+ var $li = $(this);
+
+ var $text = $li.find('> p, > span');
+ var $a = $li.find('> a, > p a, > span a');
+
+ article.title = $text.text();
+ if ($a.length > 0) {
+ article.title = $a.first().text();
+ article.ref = $a.attr('href');
+ }
+
+ // Inner list, with children article
+ var $sub = $li.find('> ol, > ul, > .olist > ol');
+ article.articles = parseList($sub, $);
+
+ articles.push(article);
+ });
+
+ return articles;
+}
+
+
+// Inherit from the html parser
+function inherits(opts) {
+ var parser = _.defaults(opts, {
+ toHTML: _.identity
+ });
+
+ parser.readme = _.compose(opts.toHTML, parseReadme);
+ parser.summary = _.compose(opts.toHTML, parseSummary);
+
+ return parser;
+}
+
+
+module.exports = inherits({
+ extensions: ['.html']
+});
+module.exports.inherits = inherits;