You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
184 lines
5.8 KiB
JavaScript
184 lines
5.8 KiB
JavaScript
3 years ago
|
/*
 * to-markdown - an HTML to Markdown converter
 *
 * Copyright 2011, Dom Christie
 * Licenced under the MIT licence
 *
 */
|
||
|
|
||
|
/**
 * Converts an HTML string to Markdown by successive regular-expression
 * rewrites (no DOM is involved).
 *
 * Processing order: inline/simple elements (ELEMENTS table), `<pre>` code
 * blocks, lists (innermost first), blockquotes (innermost first), and a final
 * whitespace clean-up. Tags that are not handled are left in the output as-is.
 *
 * @param {string} string HTML to convert. `null`/`undefined` are treated as an
 *     empty string; any other value is coerced with `String()`.
 * @returns {string} The Markdown text.
 */
var toMarkdown = function(string) {

  string = (string === null || string === undefined) ? '' : String(string);

  // Element rules, applied in array order.
  //   patterns:    tag name (regex source) or an array of tag names
  //   type:        'void' for elements without a closing tag
  //   replacement: a literal string, or a function that receives the regex
  //                match followed by its capture groups
  var ELEMENTS = [
    {
      patterns: 'p',
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '\n\n' + innerHTML + '\n' : '';
      }
    },
    {
      patterns: 'br',
      type: 'void',
      replacement: '\n'
    },
    {
      // The tag pattern itself contains a capture group, so the heading level
      // arrives as the first capture, ahead of the attributes and content.
      patterns: 'h([1-6])',
      replacement: function(str, hLevel, attrs, innerHTML) {
        var hPrefix = '';
        for(var n = 0; n < hLevel; n++) {
          hPrefix += '#';
        }
        return '\n\n' + hPrefix + ' ' + innerHTML + '\n';
      }
    },
    {
      patterns: 'hr',
      type: 'void',
      replacement: '\n\n* * *\n'
    },
    {
      patterns: 'a',
      replacement: function(str, attrs, innerHTML) {
        var href = getAttr(attrs, 'href'),
            title = getAttr(attrs, 'title');
        // Anchors without an href (e.g. named anchors) are left as HTML.
        return href !== null ? '[' + innerHTML + ']' + '(' + href + (title ? ' "' + title + '"' : '') + ')' : str;
      }
    },
    {
      patterns: ['b', 'strong'],
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '**' + innerHTML + '**' : '';
      }
    },
    {
      patterns: ['i', 'em'],
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '_' + innerHTML + '_' : '';
      }
    },
    {
      patterns: 'code',
      replacement: function(str, attrs, innerHTML) {
        return innerHTML ? '`' + innerHTML + '`' : '';
      }
    },
    {
      patterns: 'img',
      type: 'void',
      replacement: function(str, attrs) {
        var src = getAttr(attrs, 'src'),
            alt = getAttr(attrs, 'alt'),
            title = getAttr(attrs, 'title');
        // An image without a src cannot be expressed in Markdown; drop it
        // instead of throwing on the missing attribute.
        if(src === null) {
          return '';
        }
        return '![' + (alt ? alt : '') + ']' + '(' + src + (title ? ' "' + title + '"' : '') + ')';
      }
    }
  ];

  for(var i = 0, len = ELEMENTS.length; i < len; i++) {
    // Normalise `patterns` so single tags and tag lists share one code path.
    var tags = [].concat(ELEMENTS[i].patterns);
    for(var j = 0, pLen = tags.length; j < pLen; j++) {
      string = replaceEls(string, { tag: tags[j], replacement: ELEMENTS[i].replacement, type: ELEMENTS[i].type });
    }
  }

  /**
   * Replaces every occurrence of one element in `html`.
   *
   * @param {string} html Text to rewrite.
   * @param {{tag: string, replacement: (string|Function), type: (string|undefined)}} elProperties
   *     Tag pattern, its replacement, and 'void' when there is no closing tag.
   * @returns {string} The rewritten text.
   */
  function replaceEls(html, elProperties) {
    var pattern = elProperties.type === 'void' ?
          '<' + elProperties.tag + '\\b([^>]*)\\/?>' :
          '<' + elProperties.tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + elProperties.tag + '>',
        regex = new RegExp(pattern, 'gi');
    // String.prototype.replace accepts both string and function replacements.
    return html.replace(regex, elProperties.replacement);
  }

  /**
   * Reads one attribute value out of a tag's raw attribute text.
   * Handles double-quoted, single-quoted and unquoted values, so a value such
   * as title="Dom's" is not cut off at the apostrophe, and requires the name
   * to start the text or follow whitespace so `title` does not match inside
   * `subtitle`.
   *
   * @param {string} attrs Raw attribute text captured from the opening tag.
   * @param {string} name Attribute name (matched case-insensitively).
   * @returns {?string} The value ('' when present but empty), or null when
   *     the attribute is absent.
   */
  function getAttr(attrs, name) {
    var regex = new RegExp('(?:^|\\s)' + name + '\\s*=\\s*(?:"([^"]*)"|\'([^\']*)\'|["\']?([^\\s"\'>]*))', 'i'),
        match = attrs.match(regex);
    if(!match) {
      return null;
    }
    if(match[1] !== undefined) {
      return match[1];
    }
    if(match[2] !== undefined) {
      return match[2];
    }
    return match[3];
  }

  // Pre code blocks
  // <code> has already been turned into backticks above, so a <pre><code>
  // pair shows up here as <pre>`...`</pre>. The content match is lazy so that
  // several code blocks in one document are converted separately.
  string = string.replace(/<pre\b[^>]*>`([\s\S]*?)`<\/pre>/gi, function(str, innerHTML) {
    // The pattern has no `m` flag, so only a run of tabs at the very start of
    // the block is replaced.
    innerHTML = innerHTML.replace(/^\t+/g, ' ');
    // Markdown code blocks are indented by four spaces on every line.
    innerHTML = innerHTML.replace(/\n/g, '\n    ');
    return '\n\n    ' + innerHTML + '\n';
  });

  // Lists

  // Escape numbers that could trigger an ol.
  // If there are more than three spaces before the number, it would be in a pre tag.
  // Make sure we are escaping the period, not matching any character.
  // NOTE: without the `m` flag this only inspects the very start of the input.
  string = string.replace(/^(\s{0,3}\d+)\. /g, '$1\\. ');

  // Converts lists that have no child lists first, then works its way up.
  var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
  while(string.match(noChildrenRegex)) {
    string = string.replace(noChildrenRegex, function(str) {
      return replaceLists(str);
    });
  }

  /**
   * Builds the marker for one list item, padded to at least four columns so
   * that item content and nested markers line up on a four-space boundary.
   *
   * @param {boolean} isOrdered True for <ol>, false for <ul>.
   * @param {number} number 1-based position of the item.
   * @returns {string} e.g. '*   ', '1.  ' or '10. '.
   */
  function listPrefix(isOrdered, number) {
    var prefix = (isOrdered ? number + '.' : '*') + ' ';
    while(prefix.length < 4) {
      prefix += ' ';
    }
    return prefix;
  }

  /**
   * Converts one `<li ...>content` chunk (closing tag already removed).
   * Every non-blank continuation line is indented by the marker width, which
   * keeps following paragraphs and already-converted nested lists inside the
   * item.
   *
   * @param {string} li The chunk to convert.
   * @param {string} prefix Marker produced by listPrefix().
   * @returns {string} The Markdown list item.
   */
  function convertListItem(li, prefix) {
    var indent = new Array(prefix.length + 1).join(' ');
    return li.replace(/\s*<li\b[^>]*>([\s\S]*)/i, function(str, innerHTML) {
      innerHTML = innerHTML.replace(/^\s+/, '');
      innerHTML = innerHTML.replace(/\n(?=[^\n])/g, '\n' + indent);
      return prefix + innerHTML;
    });
  }

  /**
   * Converts a <ul>/<ol> that contains no further lists into Markdown.
   *
   * @param {string} html A single list element.
   * @returns {string} The Markdown list, preceded by a blank line.
   */
  function replaceLists(html) {
    html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) {
      // Tag matching is case-insensitive, so normalise before comparing.
      var isOrdered = listType.toLowerCase() === 'ol',
          lis = innerHTML.split(/<\/li>/i),
          k, count;

      // Whatever follows the last </li> is not part of any item.
      lis.splice(lis.length - 1, 1);

      for(k = 0, count = lis.length; k < count; k++) {
        if(lis[k]) {
          lis[k] = convertListItem(lis[k], listPrefix(isOrdered, k + 1));
        }
      }
      return lis.join('\n');
    });
    // Strip trailing blanks but keep the line breaks themselves; removing the
    // newline as well would glue adjacent items together.
    return '\n\n' + html.replace(/[ \t]+\n/g, '\n').replace(/\s+$/, '');
  }

  // Blockquotes
  // As with lists, the innermost blockquotes are converted first.
  var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi;
  while(string.match(deepest)) {
    string = string.replace(deepest, function(str) {
      return replaceBlockquotes(str);
    });
  }

  /**
   * Converts a blockquote that contains no further blockquotes.
   *
   * @param {string} html A single blockquote element.
   * @returns {string} The content with every line prefixed by '> '.
   */
  function replaceBlockquotes(html) {
    html = html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) {
      inner = inner.replace(/^\s+|\s+$/g, '');
      inner = cleanUp(inner);
      inner = inner.replace(/^/gm, '> ');
      // Tidy the spacing between the markers of nested quotes.
      inner = inner.replace(/^(>([ \t]{2,}>)+)/gm, '> >');
      return inner;
    });
    return html;
  }

  /**
   * Normalises line breaks in converted text.
   *
   * @param {string} string Text to tidy.
   * @returns {string} The tidied text.
   */
  function cleanUp(string) {
    // Trim leading/trailing tabs and line breaks. Spaces are not in the
    // character class, so a leading code-block indent is kept.
    string = string.replace(/^[\t\r\n]+|[\t\r\n]+$/g, '');
    string = string.replace(/\n\s+\n/g, '\n\n');
    string = string.replace(/\n{3,}/g, '\n\n'); // limit consecutive linebreaks to 2
    return string;
  }

  return cleanUp(string);
};
|
||
|
|
||
|
// Publish the converter on the CommonJS `exports` object when one is present.
// `typeof null` is also 'object', hence the explicit null check before
// assigning a property.
if (typeof exports === 'object' && exports !== null) {
  exports.toMarkdown = toMarkdown;
}
|