QQ收藏自动提取正文分析

来源:互联网 发布:淘宝宝贝属性 编辑:程序博客网 时间:2024/04/28 14:16

说明

QQ收藏可以自动提取网页正文。如果在采集时用上这个功能,将会省去很多编写采集规则的时间。
现在将简单介绍一下QQ收藏的提取算法,权当抛砖引玉,提供一种思路。

参数

QQ收藏提取正文参考了很多参数,包括正文常见标签/元素宽高/文本长度等。

忽略的标签

// Tag names that are not descended into while collecting text nodes:
// Article/Article's _checkTagName looks each element's tagName up in this list
// and _getAllTexts only recurses into elements that are NOT found in it.
// Note: "DD" appears twice; the duplicate is redundant for a membership lookup.
IGNORE_TAGS : ["A", "DD", "DT", "OL", "OPTION", "DL", "DD", "SCRIPT", "STYLE", "UL", "LI", "IFRAME"],

提取标题用到的标签

// Heading tag names H1-H6. According to the surrounding article these are used
// for title extraction; the code that consumes this list is not shown here.
TITLE_TAGS : ["H1", "H2", "H3", "H4", "H5", "H6"],

正文可能性极小的 id/class 关键词(用于匹配元素的 id 和 className)

// Case-insensitive pattern of keywords that suggest non-article regions.
// calcContentWeight tests it against the id and className of the element and
// of each of its ancestors; every match subtracts 0.8 from the content weight.
MINOR_REGEXP : /comment|combx|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor/i,

正文可能性很大的 id/class 关键词(用于匹配元素的 id 和 className)

// Case-insensitive pattern of keywords that suggest article content.
// calcContentWeight tests it against the id and className of the element and
// of each of its ancestors; every match adds 0.4 to the content weight.
MAJOR_REGEXP : /article|entry|post|body|column|main|content/i,

权重算法

根据参数来算权重,权重越大,正文的可能性也越大。

计算结构权重

calcStructureWeight : function () {    var j = 0;    for (var h = 0, d = this._texts.length; h < d; h++) {        var i = this._texts[h],        f = commonTool.trim(i.nodeValue).length,        g = 1;        if (f < 20) {            continue        }        for (var e = i.parentNode; e && e != this.elem; e = e.parentNode) {            g -= 0.1        }        j += Math.pow(g * f, 1.25)    }    return j},

计算内容权重

calcContentWeight : function () {    var d = 1;    for (var e = this.elem; e; e = e.parentNode) {        if (e.id) {            if (ArticleConfig.clipperArticleConfig.MAJOR_REGEXP.test(e.id)) {                d += 0.4            }            if (ArticleConfig.clipperArticleConfig.MINOR_REGEXP.test(e.id)) {                d -= 0.8            }        }        if (e.className) {            if (ArticleConfig.clipperArticleConfig.MAJOR_REGEXP.test(e.className)) {                d += 0.4            }            if (ArticleConfig.clipperArticleConfig.MINOR_REGEXP.test(e.className)) {                d -= 0.8            }        }    }    return d},

提取正文

/**
 * Module "Article/Page": scans a document for candidate elements, wraps each
 * candidate in an Article (module "Article/Article") and selects the one that
 * most likely holds the main text.
 *
 * The factory parameter names (b, a) are kept as in the original; `b` is used
 * to obtain other modules by name, `a` is unused here.
 */
MyCollectionDefine("Article/Page", function (b, a) {
    // Constructor provided by the "Article/Article" module. Instances are read
    // here through their `weight` and `_texts` properties.
    var Article = b("Article/Article");

    // Candidates whose weight is below this value are never reported.
    var MIN_WEIGHT = 400;
    // A runner-up whose weight is within this percentage of the leader's
    // weight may replace the leader (see _getMainArticle).
    var CLOSE_MARGIN_PERCENT = 15;

    /**
     * @param {Object} contentDocument object exposing getElementsByTagName
     *     (e.g. a Document) that will be searched for the main article.
     */
    var Page = function (contentDocument) {
        this.contentDocument = contentDocument;
    };

    Page.prototype = {
        /**
         * @returns {Object|null} the best Article candidate, or null when no
         *     candidate qualifies.
         */
        getMainArticle : function () {
            return this._getMainArticle() || null;
        },

        /**
         * Builds an Article for every element that is not on the ignore list,
         * is visible and exceeds the configured minimum offset size.
         * @returns {Array} list of Article instances (possibly empty).
         */
        _getAllArticle : function () {
            var elements = this.contentDocument.getElementsByTagName("*");
            var articles = [];
            var total = elements.length;
            for (var idx = 0; idx < total; ++idx) {
                var element = elements[idx];
                if (!this._checkIgnoreTagName(element.tagName) && this._checkVisibility(element) && this._checkSize(element)) {
                    articles.push(new Article(element));
                }
            }
            return articles;
        },

        /**
         * @param {string} tagName tag name to look up.
         * @returns {boolean} true when tagName is listed in
         *     ArticleConfig.clipperPageConfig.ignoreTag.
         */
        _checkIgnoreTagName : function (tagName) {
            return commonTool.indexOf(ArticleConfig.clipperPageConfig.ignoreTag, tagName) != -1;
        },

        /**
         * @param {Object} element element to inspect.
         * @returns {boolean} false for a missing element, or one whose computed
         *     style is hidden, display:none, or has a non-positive height/width.
         */
        _checkVisibility : function (element) {
            if (!element) {
                return false;
            }
            var style = commonTool.getComputedStyle(element, null, null);
            // Non-numeric values such as "auto" parse to NaN, and NaN <= 0 is
            // false, so they do not count as hidden.
            var hidden = commonTool.css(style, "visibility") == "hidden" ||
                commonTool.css(style, "display") == "none" ||
                parseInt(commonTool.css(style, "height"), 10) <= 0 ||
                parseInt(commonTool.css(style, "width"), 10) <= 0;
            return !hidden;
        },

        /**
         * @param {Object} element element to inspect.
         * @returns {boolean} true when both offsetWidth and offsetHeight exceed
         *     the thresholds in ArticleConfig.clipperPageConfig.
         */
        _checkSize : function (element) {
            return element.offsetWidth > ArticleConfig.clipperPageConfig.elemOffsetWidth && element.offsetHeight > ArticleConfig.clipperPageConfig.elemOffsetHeight;
        },

        /**
         * Sorts the candidates by descending weight and picks the leader. When
         * the runner-up weighs more than MIN_WEIGHT, is within
         * CLOSE_MARGIN_PERCENT of the leader and holds more text nodes, the
         * runner-up is preferred.
         * @returns {Object|null} the chosen Article, or null when there is no
         *     candidate or the chosen one weighs less than MIN_WEIGHT.
         */
        _getMainArticle : function () {
            var candidates = this._getAllArticle();
            // Without this guard an empty candidate list would make the code
            // below read `.weight` of undefined and throw.
            if (candidates.length === 0) {
                return null;
            }
            candidates.sort(function (x, y) {
                return y.weight - x.weight;
            });
            var best = candidates[0];
            var second = candidates[1];
            if (second && second.weight > MIN_WEIGHT &&
                    (best.weight - second.weight) * 100 / best.weight < CLOSE_MARGIN_PERCENT &&
                    second._texts.length > best._texts.length) {
                best = second;
            }
            return best.weight < MIN_WEIGHT ? null : best;
        }
    };

    return Page;
});

关于这块的完整代码:

/**
 * Module "Article/Page": scans a document for candidate elements, wraps each
 * candidate in an Article (module "Article/Article") and selects the one that
 * most likely holds the main text.
 *
 * The factory parameter names (b, a) are kept as in the original; `b` is used
 * to obtain other modules by name, `a` is unused here.
 */
MyCollectionDefine("Article/Page", function (b, a) {
    // Constructor provided by the "Article/Article" module.
    var Article = b("Article/Article");

    // Candidates whose weight is below this value are never reported.
    var MIN_WEIGHT = 400;
    // A runner-up whose weight is within this percentage of the leader's
    // weight may replace the leader (see _getMainArticle).
    var CLOSE_MARGIN_PERCENT = 15;

    /**
     * @param {Object} contentDocument object exposing getElementsByTagName
     *     (e.g. a Document) that will be searched for the main article.
     */
    var Page = function (contentDocument) {
        this.contentDocument = contentDocument;
    };

    Page.prototype = {
        /**
         * @returns {Object|null} the best Article candidate, or null when no
         *     candidate qualifies.
         */
        getMainArticle : function () {
            return this._getMainArticle() || null;
        },

        /**
         * Builds an Article for every element that is not on the ignore list,
         * is visible and exceeds the configured minimum offset size.
         * @returns {Array} list of Article instances (possibly empty).
         */
        _getAllArticle : function () {
            var elements = this.contentDocument.getElementsByTagName("*");
            var articles = [];
            var total = elements.length;
            for (var idx = 0; idx < total; ++idx) {
                var element = elements[idx];
                if (!this._checkIgnoreTagName(element.tagName) && this._checkVisibility(element) && this._checkSize(element)) {
                    articles.push(new Article(element));
                }
            }
            return articles;
        },

        /**
         * @param {string} tagName tag name to look up.
         * @returns {boolean} true when tagName is listed in
         *     ArticleConfig.clipperPageConfig.ignoreTag.
         */
        _checkIgnoreTagName : function (tagName) {
            return commonTool.indexOf(ArticleConfig.clipperPageConfig.ignoreTag, tagName) != -1;
        },

        /**
         * @param {Object} element element to inspect.
         * @returns {boolean} false for a missing element, or one whose computed
         *     style is hidden, display:none, or has a non-positive height/width.
         */
        _checkVisibility : function (element) {
            if (!element) {
                return false;
            }
            var style = commonTool.getComputedStyle(element, null, null);
            // Non-numeric values such as "auto" parse to NaN, and NaN <= 0 is
            // false, so they do not count as hidden.
            var hidden = commonTool.css(style, "visibility") == "hidden" ||
                commonTool.css(style, "display") == "none" ||
                parseInt(commonTool.css(style, "height"), 10) <= 0 ||
                parseInt(commonTool.css(style, "width"), 10) <= 0;
            return !hidden;
        },

        /**
         * @param {Object} element element to inspect.
         * @returns {boolean} true when both offsetWidth and offsetHeight exceed
         *     the thresholds in ArticleConfig.clipperPageConfig.
         */
        _checkSize : function (element) {
            return element.offsetWidth > ArticleConfig.clipperPageConfig.elemOffsetWidth && element.offsetHeight > ArticleConfig.clipperPageConfig.elemOffsetHeight;
        },

        /**
         * Sorts the candidates by descending weight and picks the leader. When
         * the runner-up weighs more than MIN_WEIGHT, is within
         * CLOSE_MARGIN_PERCENT of the leader and holds more text nodes, the
         * runner-up is preferred.
         * @returns {Object|null} the chosen Article, or null when there is no
         *     candidate or the chosen one weighs less than MIN_WEIGHT.
         */
        _getMainArticle : function () {
            var candidates = this._getAllArticle();
            // Without this guard an empty candidate list would make the code
            // below read `.weight` of undefined and throw.
            if (candidates.length === 0) {
                return null;
            }
            candidates.sort(function (x, y) {
                return y.weight - x.weight;
            });
            var best = candidates[0];
            var second = candidates[1];
            if (second && second.weight > MIN_WEIGHT &&
                    (best.weight - second.weight) * 100 / best.weight < CLOSE_MARGIN_PERCENT &&
                    second._texts.length > best._texts.length) {
                best = second;
            }
            return best.weight < MIN_WEIGHT ? null : best;
        }
    };

    return Page;
});

/**
 * Module "Article/Article": wraps one candidate element, collects the text
 * nodes beneath it and computes a weight used to rank candidates
 * (weight = structure weight * content weight).
 */
MyCollectionDefine("Article/Article", function (b, a) {
    /**
     * @param {Object} elem candidate element. On construction the instance
     *     stores elem, its position (commonTool.findPos), the text nodes found
     *     up to ArticleConfig.clipperArticleConfig.findMaxDepth levels deep,
     *     and the computed weight.
     */
    var Article = function (elem) {
        this.elem = elem;
        this.offset = commonTool.findPos(elem);
        this._texts = this._getAllTexts(elem, ArticleConfig.clipperArticleConfig.findMaxDepth);
        this.weight = this._getWeight();
    };

    Article.prototype = {
        /**
         * Recursively collects qualifying text nodes below `root`.
         * A text node qualifies when it is visible, passes _checkLength, has a
         * non-empty trimmed value, and neither its parent nor its grandparent
         * matches _checkMinorContent. Child elements are descended into only
         * when _checkTagName accepts them.
         * @param {Object} root node whose children are examined.
         * @param {number} depth remaining levels to descend; nothing is
         *     collected unless depth > 0.
         * @returns {Array} text nodes in document order.
         */
        _getAllTexts : function (root, depth) {
            var texts = [];
            // Written as !(depth > 0) so a missing/NaN depth also stops here.
            if (!(depth > 0)) {
                return texts;
            }
            for (var child = root.firstChild; child; child = child.nextSibling) {
                if (!this._checkVisibility(child)) {
                    continue;
                }
                if (child.nodeType == Node.TEXT_NODE && this._checkLength(child)) {
                    var parent = child.parentNode || {};
                    var grandparent = parent.parentNode || {};
                    if (!(this._checkMinorContent(parent) || this._checkMinorContent(grandparent)) && commonTool.trim(child.nodeValue)) {
                        texts.push(child);
                    }
                } else if (child.nodeType == Node.ELEMENT_NODE && this._checkTagName(child)) {
                    texts = texts.concat(this._getAllTexts(child, depth - 1));
                }
            }
            return texts;
        },

        /**
         * @param {Object} node node to inspect.
         * @returns {boolean} false for a missing node or one whose computed
         *     style is hidden / display:none / non-positive in size; true when
         *     no computed style is available for the node.
         */
        _checkVisibility : function (node) {
            if (!node) {
                return false;
            }
            var style = commonTool.getComputedStyle(node, null, null);
            if (!style) {
                return true;
            }
            var hidden = commonTool.css(style, "visibility") == "hidden" ||
                commonTool.css(style, "display") == "none" ||
                parseInt(commonTool.css(style, "height"), 10) <= 0 ||
                parseInt(commonTool.css(style, "width"), 10) <= 0;
            return !hidden;
        },

        /**
         * Sums a score over this._texts. Text nodes whose trimmed length is
         * below 20 are skipped; each other node adds (factor * length) ^ 1.25,
         * where factor starts at 1 and loses 0.1 per ancestor between the text
         * node and this.elem.
         * @returns {number} accumulated structure score.
         */
        calcStructureWeight : function () {
            var total = 0;
            for (var idx = 0, count = this._texts.length; idx < count; idx++) {
                var textNode = this._texts[idx];
                var length = commonTool.trim(textNode.nodeValue).length;
                if (length < 20) {
                    continue;
                }
                var factor = 1;
                for (var ancestor = textNode.parentNode; ancestor && ancestor !== this.elem; ancestor = ancestor.parentNode) {
                    factor -= 0.1;
                }
                // More than ten intermediate ancestors would make the factor
                // negative, and Math.pow(negative, 1.25) is NaN, which would
                // turn the whole sum into NaN. Clamp at 0 instead.
                total += Math.pow(Math.max(factor, 0) * length, 1.25);
            }
            return total;
        },

        /**
         * Starts at 1 and walks from this.elem up through every parentNode.
         * A non-empty id and a non-empty className are each tested against
         * MAJOR_REGEXP (match: +0.4) and MINOR_REGEXP (match: -0.8).
         * @returns {number} content multiplier.
         */
        calcContentWeight : function () {
            var weight = 1;
            for (var node = this.elem; node; node = node.parentNode) {
                var labels = [node.id, node.className];
                for (var k = 0; k < labels.length; k++) {
                    var label = labels[k];
                    if (!label) {
                        continue;
                    }
                    if (ArticleConfig.clipperArticleConfig.MAJOR_REGEXP.test(label)) {
                        weight += 0.4;
                    }
                    if (ArticleConfig.clipperArticleConfig.MINOR_REGEXP.test(label)) {
                        weight -= 0.8;
                    }
                }
            }
            return weight;
        },

        /** @returns {number} structure weight multiplied by content weight. */
        _getWeight : function () {
            return this.calcStructureWeight() * this.calcContentWeight();
        },

        /**
         * @param {Object} element element to inspect.
         * @returns {boolean} true when element.tagName is NOT listed in
         *     ArticleConfig.clipperArticleConfig.IGNORE_TAGS.
         */
        _checkTagName : function (element) {
            return commonTool.indexOf(ArticleConfig.clipperArticleConfig.IGNORE_TAGS, element.tagName) == -1;
        },

        /**
         * @param {Object} textNode text node to inspect.
         * @returns {boolean} result of testing its nodeValue against
         *     ArticleConfig.clipperArticleConfig.BLANK_REGEXP (pattern defined
         *     elsewhere).
         */
        _checkLength : function (textNode) {
            return Boolean(ArticleConfig.clipperArticleConfig.BLANK_REGEXP.test(textNode.nodeValue));
        },

        /**
         * @param {Object} node node (or placeholder object) to inspect.
         * @returns {boolean} result of testing "<id> <className>" against
         *     ArticleConfig.clipperArticleConfig.TINY_REGEXP (pattern defined
         *     elsewhere).
         */
        _checkMinorContent : function (node) {
            return Boolean(ArticleConfig.clipperArticleConfig.TINY_REGEXP.test(node.id + " " + node.className));
        }
    };

    return Article;
});
0 0
原创粉丝点击