1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
var _ = require("lodash");
var kramed = require('kramed');
var textRenderer = require('kramed-text-renderer');
var entryId = require('../../parse/glossary').entryId;
/**
 * Builds an index relating glossary terms to the pages that mention them.
 *
 * May be called with or without `new`.
 *
 * @param {Array} [glossary] - Glossary entries; the `id` of each entry is
 *     used as a search term. Defaults to an empty list.
 */
function Indexer(glossary) {
    if(!(this instanceof Indexer)) {
        return new Indexer(glossary);
    }

    // Bind methods to this instance so they can be handed around as bare
    // callbacks (e.g. `this.text` passed to `map` in `add`).
    _.bindAll(this);

    this.glossary = glossary || [];

    this.glossaryTerms = _.pluck(this.glossary, "id");

    // Regex alternation picks the first alternative that matches at a given
    // position, so a short term listed before a longer term that starts with
    // it (e.g. "git" before "github") would shadow the longer one. Order the
    // alternatives longest-first; sort a copy so `glossaryTerms` keeps the
    // glossary's own order.
    var termsByLength = this.glossaryTerms.slice().sort(function(a, b) {
        return b.length - a.length;
    });

    // Regex for searching for terms through body.
    // With an empty glossary the pattern is "()", which matches the empty
    // string; `add` returns early in that case before the regex is used.
    this.termsRegex = new RegExp(
        // Match any of the terms
        "(" +
            termsByLength.map(regexEscape).join('|') +
        ")",

        // Flags: find every occurrence, ignoring case
        "gi"
    );

    // page url => terms
    this.idx = {
        /*
        "a/b.html": ["one word", "second word"]
        */
    };

    // term => page urls
    this.invertedIdx = {
        /*
        "word1": ["page1.html", "page2.html"]
        */
    };

    // Use text renderer
    this.renderer = textRenderer();
}
/**
 * Runs a section's nodes through kramed's parser using this indexer's
 * renderer (the one created from `kramed-text-renderer`).
 *
 * @param {Object} nodes - Array-like collection of nodes; it is copied, so
 *     the caller's object is not modified.
 * @returns {*} Whatever `kramed.parser` produces for the copied nodes
 *     (presumably the rendered text -- confirm against kramed).
 */
Indexer.prototype.text = function(nodes) {
    // Work on a fresh array so attaching `links` does not touch the input.
    var tokens = _.toArray(nodes);

    // kramed's parser expects a `links` property; nothing is put in it yet.
    tokens.links = {};

    // kramed's defaults with only the renderer swapped for ours.
    var rendererOverride = { renderer: this.renderer };
    var parserOptions = _.extend({}, kramed.defaults, rendererOverride);

    return kramed.parser(tokens, parserOptions);
};
/**
 * Add page to glossary index.
 *
 * Renders every section whose `type` is 'normal' through `this.text`,
 * searches the combined text with `this.termsRegex`, and records the
 * matched terms in `this.idx` (page url => terms) and `this.invertedIdx`
 * (term => page urls). Does nothing when the glossary is empty.
 *
 * @param {Array} sections - Sections of the page; only those with
 *     `type === 'normal'` are searched.
 * @param {string} url - Key under which the page is indexed.
 * @returns {undefined}
 */
Indexer.prototype.add = function(sections, url) {
    // Nothing to match against: leave both indexes untouched.
    if(!(this.glossary && this.glossary.length > 0)) {
        return;
    }

    var textblob =
        _.where(sections, { type: 'normal' })
        .map(this.text)
        .join('\n');

    // `match` yields null when nothing is found, hence the `|| []`.
    // Each raw match is passed through `entryId` (presumably normalizing it
    // to a glossary id -- confirm in ../../parse/glossary), then deduplicated.
    var matches = _(textblob.match(this.termsRegex) || [])
        .map(entryId)
        .uniq()
        .value();

    // Add idx for book
    this.idx[url] = matches;

    // Add to inverted idx.
    // Use an own-property check rather than a truthiness test: a term such as
    // "constructor" or "toString" would otherwise find the inherited
    // Object.prototype member, skip initialization, and crash on `.push`.
    var invertedIdx = this.invertedIdx;
    var hasOwn = Object.prototype.hasOwnProperty;

    matches.forEach(function(match) {
        if(!hasOwn.call(invertedIdx, match)) {
            invertedIdx[match] = [];
        }
        invertedIdx[match].push(url);
    });
};
/**
 * Escapes a string so it can be embedded in a regular expression and be
 * matched literally.
 *
 * @param {string} s - Text to escape.
 * @returns {string} `s` with a backslash placed before each of
 *     - / \ ^ $ * + ? . ( ) | [ ] { }
 */
function regexEscape(s) {
    // Characters that carry special meaning inside a regular expression.
    var specialChars = /[-\/\\^$*+?.()|[\]{}]/g;

    // "$&" stands for the matched character itself.
    var escapedForm = '\\$&';

    return s.replace(specialChars, escapedForm);
}
// Exports: the Indexer constructor is the module's single export; it can be
// called with or without `new`.
module.exports = Indexer;
|