summaryrefslogtreecommitdiffstats
path: root/lib/parsers/html.js
blob: 8f4ed34f708294eab129e7e74e7bdd90566b6c41 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
var _ = require('lodash');
var cheerio = require('cheerio');

/**
 * Parse a summary document and return its list of sections.
 *
 * Every `ul` / `ol` that is a direct child of the page container becomes
 * one section; the section's `articles` are built by `parseList`.
 *
 * @param html - HTML markup, handed as-is to `cheerio.load`
 * @returns {Array} one `{ articles: [...] }` object per top-level list
 */
function parseSummary(html) {
    var $ = cheerio.load(html);
    var $container = getContainer($);
    var result = [];

    // Only lists sitting directly under the container start a section
    $container.find('> ul, > ol').each(function(index, list) {
        result.push({
            articles: parseList($(list), $)
        });
    });

    return result;
}

/**
 * Parse a readme document and extract its title and description.
 *
 * The title is the text of `h1:first-child` inside the page container;
 * the description is the text of the first `div.paragraph`. Both values
 * are trimmed.
 *
 * @param html - HTML markup, handed as-is to `cheerio.load`
 * @returns {{title: string, description: string}}
 */
function parseReadme(html) {
    var $ = cheerio.load(html);
    var $container = getContainer($);

    var heading = $container.find('h1:first-child').text();
    var intro = $container.find('div.paragraph').first().text();

    return {
        title: heading.trim(),
        description: intro.trim()
    };
}

/**
 * Return the page container to search in: the first `body` / `html`
 * element when the document has one, otherwise the document root.
 *
 * @param $ - loaded cheerio instance (the function returned by `cheerio.load`)
 * @returns a cheerio selection wrapping the container
 */
function getContainer($) {
    var $body = $('body, html').first();

    // A selection is always a truthy object, even when it matched nothing,
    // so emptiness has to be tested through `length`. For a bare fragment
    // (no html/body wrapper) fall back to the root selection, which still
    // supports `.find()`, unlike the `$` function itself.
    if ($body.length === 0) $body = $.root();

    return $body;
}

/**
 * Parse a list element and return its chapters, recursively.
 *
 * Each direct `li` child yields one article:
 *   - `title`: text of the first link found in the item, or, when the item
 *     has no link, the text of its direct `p` / `span` children;
 *   - `ref`: the link's `href` (only set when a link exists);
 *   - `articles`: the result of parsing the item's nested list(s).
 *
 * @param $ul - cheerio selection of the list(s) to walk
 * @param $ - loaded cheerio instance used to wrap each `li`
 * @returns {Array} the articles, in document order
 */
function parseList($ul, $) {
    var result = [];

    $ul.children('li').each(function(index, item) {
        var $item = $(item);
        var $label = $item.find('> p, > span');
        var $links = $item.find('> a, > p a, > span a');
        var hasLink = $links.length > 0;

        var entry = {
            title: hasLink ? $links.first().text() : $label.text()
        };
        if (hasLink) {
            entry.ref = $links.attr('href');
        }

        // Nested list(s) hold the children of this article
        entry.articles = parseList($item.find('> ol, > ul, > .olist > ol'), $);

        result.push(entry);
    });

    return result;
}


/**
 * Build a parser that inherits from the html parser.
 *
 * `opts` is completed in place (a missing `toHTML` defaults to the identity
 * function) and returned with two added entry points, `readme` and
 * `summary`. Each one first converts its input to HTML with `toHTML`, then
 * runs the matching html parser on the result.
 *
 * @param {Object} opts - parser definition; `toHTML` is optional
 * @returns {Object} the same `opts` object, extended with `readme`/`summary`
 */
function inherits(opts) {
    var parser = _.defaults(opts, {
        toHTML: _.identity
    });

    // `_.compose` applies its functions right-to-left, so the converter must
    // be listed last to run first: source -> toHTML -> parse.
    parser.readme = _.compose(parseReadme, parser.toHTML);
    parser.summary = _.compose(parseSummary, parser.toHTML);

    return parser;
}


// The module's export is the html parser itself: no `toHTML` is supplied,
// so `inherits` fills in its default.
var htmlParser = inherits({
    extensions: ['.html']
});

// Expose the factory on the export so other parsers can be built from it
htmlParser.inherits = inherits;

module.exports = htmlParser;