1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
var _ = require('lodash');
var cheerio = require('cheerio');
// Parse the HTML of a summary and return its sections.
// Each list matched by '> ul, > ol' under the page container yields one
// section of the form { articles: [...] }, where the articles are produced
// by parseList for that list.
function parseSummary(html) {
    var $ = cheerio.load(html);
    var $container = getContainer($);
    var result = [];

    $container.find('> ul, > ol').each(function() {
        var $list = $(this);

        result.push({ articles: parseList($list, $) });
    });

    return result;
}
// Parse the HTML of a readme and return { title, description }.
// The title is the trimmed text of 'h1:first-child' inside the page
// container; the description is the trimmed text of the first
// 'div.paragraph' found there.
function parseReadme(html) {
    var $ = cheerio.load(html);
    var $container = getContainer($);

    var $heading = $container.find('h1:first-child');
    var $intro = $container.find('div.paragraph').first();

    return {
        title: $heading.text().trim(),
        description: $intro.text().trim()
    };
}
// Return the page container: the first element matching 'body, html', or
// the document root (`$.root()`) when the document has neither tag.
//   $ - the function returned by cheerio.load
function getContainer($) {
    var $body = $('body, html').first();

    // A selection is an object and therefore always truthy, even when it
    // matched nothing; an empty match has to be detected through `length`.
    if (!$body || $body.length === 0) return $.root();

    return $body;
}
// Parse a list element and return its chapters, recursively.
// Every direct 'li' child becomes an entry with:
//   title    - text of the first link when the item contains one, otherwise
//              the text of its direct 'p' / 'span' children
//   ref      - 'href' of the link (only set when the item contains a link)
//   articles - entries parsed from the lists nested inside the item
function parseList($ul, $) {
    var result = [];

    $ul.children('li').each(function() {
        var $item = $(this);
        var $label = $item.find('> p, > span');
        var $links = $item.find('> a, > p a, > span a');
        var hasLink = $links.length > 0;

        var entry = {
            title: hasLink ? $links.first().text() : $label.text()
        };
        if (hasLink) {
            entry.ref = $links.attr('href');
        }

        // Nested lists hold the child articles of this entry
        entry.articles = parseList($item.find('> ol, > ul, > .olist > ol'), $);

        result.push(entry);
    });

    return result;
}
// Build a parser that inherits from the html parser.
//   opts - parser description; `opts.toHTML` converts the source content to
//          HTML and defaults to the identity function. `opts` itself is
//          completed in place and returned (a fresh object is used when
//          `opts` is omitted).
// The returned parser gains `readme` and `summary`, which first convert
// their input with `toHTML` and then parse the resulting HTML.
function inherits(opts) {
    var parser = _.defaults(opts || {}, {
        toHTML: _.identity
    });
    var toHTML = parser.toHTML;

    // The conversion must run BEFORE the HTML parser. `_.compose(toHTML, parse)`
    // applies its functions right-to-left, i.e. it would parse the raw source
    // and then hand the parsed object to `toHTML`.
    function convertThen(parse) {
        return function(content) {
            return parse.call(this, toHTML.call(this, content));
        };
    }

    parser.readme = convertThen(parseReadme);
    parser.summary = convertThen(parseSummary);

    return parser;
}
// The module itself is the plain HTML parser; `inherits` is exposed on it so
// other parsers can be derived from it.
var htmlParser = inherits({ extensions: ['.html'] });
htmlParser.inherits = inherits;

module.exports = htmlParser;
|