csdn文章分类修改脚本

来源:互联网 发布:知乎 中美军演 编辑:程序博客网 时间:2024/05/23 01:15

以前写的文章分类太多,太乱了,所以决定来一次清理,把一些文章类别给替换掉。和LX同学讨论了一个下午,终于有一个方案了,搞了一晚上才弄好。

我是混合了python和js代码一起做的。js可以直接在浏览器的控制台里运行,浏览器里已经登录过了,脚本不用再处理登录,所以很方便。python我用得比较习惯,所以用它处理数据比较方便,而且中间结果都保存在本地文件里,不会丢失。


步骤如下:

step 1:先把文章列表抓下来,知道有哪些文章。注意:markdown编写的文章不能处理!它们在 step 2 里都会被记录到失败列表 eid 中。

"""Step 1: collect the ids of every article on the blog.

Downloads the article list pages, extracts the numeric article ids from the
links found there and writes them to ``js_array.txt`` as a JavaScript array
declaration that step 2 pastes into the browser console.
"""
import http.client
import re
import sys
import urllib.request

BLOG_LIST_URL = "http://blog.csdn.net/firenet1/article/list/"
# Number of list pages to try; the author's note sizes this assuming 15
# articles per page.
MAX_PAGES = 30
# One capture group: the numeric id at the end of an article link.
# The original passed re.S|re.M|re.I as the second argument of
# Pattern.findall(), where it is interpreted as the start position (26), so
# those flags were never in effect; the pattern does not need them.
ARTICLE_LINK = re.compile(r"/firenet1/article/details/([0-9]{5,10})")


def extract_article_ids(html):
    """Return the article ids linked from ``html``.

    Ids are returned in order of appearance, duplicates included. The whole
    string is scanned from position 0.
    """
    return ARTICLE_LINK.findall(html)


def render_js_array(ids):
    """Return the ``var id_array = new Array(...);`` source for ``ids``.

    The layout (one quoted id per line, each followed by a comma) is the one
    step 2 expects to be pasted into the console.
    """
    return "var id_array = new Array(" + "".join(
        '"%s",\n' % article_id for article_id in ids
    ) + ");"


def main():
    """Fetch every list page and write the collected ids to js_array.txt."""
    ids = {}  # used as an insertion-ordered set of article ids
    for page in range(1, MAX_PAGES + 1):
        url = BLOG_LIST_URL + str(page)
        try:
            with urllib.request.urlopen(url) as response:
                raw = response.read()
        except (OSError, http.client.HTTPException) as err:
            # Best effort: a page that cannot be fetched is skipped, but the
            # failure is reported instead of being swallowed silently.
            print("skip %s: %s" % (url, err), file=sys.stderr)
            continue
        found = extract_article_ids(raw.decode("UTF-8"))
        print(len(found))
        for article_id in found:
            ids[article_id] = 1
    print(len(ids))
    with open("js_array.txt", "w") as out_file:
        out_file.write(render_js_array(ids))


if __name__ == "__main__":
    main()

# The output file contains a JS array declaration such as:
# var id_array = new Array("77187506",
# "77073144",
# "77046721",
# );
step 2: 打开一个csdn博客编辑页,任何一页都行。F12进入控制台,在控制台里粘贴以下代码,然后执行 getdata() 开始抓取;控制台输出 finish 之后,再执行 output() 打印结果。这一步就能获得大部分文章的id、tag、类别,抓取失败的文章id保存在 eid 里。

// Step 2: collect the category and the tag markup of every article.
// Usage: paste into the DevTools console of any CSDN blog edit page, run
// getdata(), wait until "finish" is logged, then run output().
// Shared state is declared with `var` on purpose so the snippet can be pasted
// into the same console session more than once.

// Article ids produced by step 1.
var id_array = [
  "77187506",
  "77073144",
  "77046721",
];

var time_out_th = 500; // polling interval in milliseconds
var i; // index into id_array of the article being processed
var a; // handle of the popup window opened for the current article
var id; // id of the current article
var out; // collected [id, category, tagHtml] triples
var eid; // ids whose page could not be read; print it after the run

/**
 * Closes the current popup (if any) and opens the next article.
 * @returns {boolean} false once every id has been processed.
 */
function openNextArticle() {
  if (a) {
    a.close();
  }
  i++;
  if (i >= id_array.length) {
    console.log("finish");
    return false;
  }
  id = id_array[i];
  // The bare id is used as a relative URL, which is why the snippet must be
  // started from an edit page.
  a = window.open(id, "_blank");
  return true;
}

/**
 * Entry point: resets the state and starts processing the first article.
 * Articles are handled strictly one at a time, driven by a polling timer.
 */
function getdata() {
  i = 0;
  out = [];
  eid = [];
  if (id_array.length === 0) {
    console.log("finish");
    return;
  }
  id = id_array[i];
  a = window.open(id, "_blank");
  setTimeout(doing, time_out_th);
}

/**
 * One polling step: once the popup has finished loading, reads the category
 * field and the tag markup, then moves on to the next article. Any failure
 * records the id in `eid` and moves on as well.
 * @returns {number|undefined} the final index once all articles are done.
 */
function doing() {
  if (i >= id_array.length) {
    console.log("finish");
    return i;
  }
  let finished = false;
  try {
    if (!a) {
      // window.open() yields null when the popup is blocked.
      throw new Error("window.open returned no window (popup blocked?)");
    }
    if (a.document.readyState === "complete") {
      const category = a.document.getElementById("txtTag").value;
      // Strip every line break: output() emits one record per line.
      const tagHtml = a.document
        .getElementById("d_tag2")
        .innerHTML.replace(/\r?\n/g, "");
      out.push([id, category, tagHtml]);
      console.log("ok: " + i);
      finished = true;
    }
  } catch (err) {
    eid.push(id);
    console.log("err: " + i);
    finished = true;
  }
  if (finished && !openNextArticle()) {
    return i;
  }
  setTimeout(doing, time_out_th);
}

/**
 * Formats collected records as text, one article per line.
 * @param {Array<Array<string>>} records [id, category, tagHtml] triples.
 * @returns {string} lines of the form id###,###category###,###tagHtml
 */
function formatRecords(records) {
  return records
    .map(([articleId, category, tagHtml]) => `${articleId}###,###${category}###,###${tagHtml}\n`)
    .join("");
}

/** Prints the collected records to the console. */
function output() {
  console.log(formatRecords(out));
}

/*
Sample output; copy it into a local text file for the following steps:
47311009###,###多校联合训练赛###,###<span title="单击删除该标签">hdu 5328</span><span title="单击删除该标签">hdu</span>
*/
step 3:打开自己的类别管理,F12进入控制台,执行以下代码

// Step 3: list all existing categories.
// Usage: paste into the DevTools console of the category management page.

/**
 * Builds the category list text, one category per line, each followed by the
 * "###,###" separator so the replacement categories can be appended by hand.
 * @param {ArrayLike<{firstChild: {innerHTML: string}}>} cells elements with
 *   class "tdleft"; index 0 is skipped (presumably the table header —
 *   verify against the page).
 * @returns {string} the text to copy into the mapping file.
 */
function formatCategoryLines(cells) {
  let text = "";
  for (let row = 1; row < cells.length; row++) {
    text += `${cells[row].firstChild.innerHTML}###,###\n`;
  }
  return text;
}

// Only touch the DOM when one exists, i.e. when run in the browser console.
if (typeof document !== "undefined") {
  console.log(formatCategoryLines(document.getElementsByClassName("tdleft")));
}

/*
Sample output after manual editing: each line is one category. The part after
"###,###" lists the categories that should replace it, separated by commas;
it may be left empty.
动态规划###,###ACM-ICPC编程题,动态规划
数据结构###,###ACM-ICPC编程题,数据结构
字符串###,###ACM-ICPC编程题,字符串
模拟###,###ACM-ICPC编程题,模拟
*/

step 4:把所有文章的类别和tag都换成新的,我的tag是使用原来的tag,如果不够5个,会把原先的类别变成tag。python代码
"""Step 4: replace every article's old categories with the new ones.

Reads ``classify_lable.txt`` (output of step 2) and ``classfyMap.txt`` (the
hand-edited output of step 3) and prints a JavaScript ``arti`` array that
step 5 pastes into the browser console.
"""
import json
import re

SEPARATOR = "###,###"
# Captures the text of one tag. The original pattern used [\w]+, which
# silently dropped tags containing spaces or punctuation (e.g. "hdu 5328").
TAG_SPAN = re.compile(r'<span title="单击删除该标签">([^<]+)</span>')


def parse_articles(lines):
    """Parse step-2 records into ``[id, old_categories, tags]`` lists.

    ``old_categories`` is kept as the raw comma-separated string and ``tags``
    is the list of tag texts. Lines with fewer than three fields (blank or
    malformed lines) are skipped.
    """
    articles = []
    for line in lines:
        fields = line.rstrip("\n").split(SEPARATOR, 2)
        if len(fields) < 3:
            continue
        article_id, old_categories, tag_html = fields
        articles.append([article_id, old_categories, TAG_SPAN.findall(tag_html)])
    return articles


def parse_category_map(lines):
    """Parse the category mapping file.

    Returns ``(origin, now)``: ``origin`` maps an old category name to the
    list of categories replacing it, ``now`` is a dict used as an ordered set
    of all new category names. Lines without a replacement are skipped.
    """
    origin = {}
    now = {}
    for line in lines:
        fields = line.replace("\n", "").split(SEPARATOR)
        if len(fields) < 2 or len(fields[1]) == 0:
            continue
        replacements = fields[1].split(",")
        origin[fields[0]] = replacements
        for name in replacements:
            now[name] = 1
    return origin, now


def remap_article(article, origin):
    """Return ``[id, labels, categories]`` for one parsed article.

    Labels are the article's tags followed by its old category names; dicts
    are used so neither labels nor categories contain duplicates.
    """
    article_id, old_categories, tags = article[0], article[1], article[2]
    labels = {}
    new_categories = {}
    for tag in tags:
        if len(tag) > 0:
            labels[tag] = 1
    for name in old_categories.split(","):
        if len(name) > 0:
            labels[name] = 1
        if name in origin:
            for replacement in origin[name]:
                new_categories[replacement] = 1
    if len(new_categories) == 0:
        # Fallback kept from the original: a label that is itself a known old
        # category name is used as the category.
        # NOTE(review): the name is used as-is rather than its mapped
        # replacements — confirm this is intended.
        for label in labels:
            if label in origin:
                new_categories[label] = 1
    return [article_id, list(labels), list(new_categories)]


def render_js(new_articles):
    """Return the ``var arti = [...];`` source consumed by step 5.

    Each row is ``[id, tag markup, categories]``; new categories are prefixed
    with ``##``. Strings are emitted with json.dumps so quotes or backslashes
    inside a tag cannot break the generated JavaScript.
    """
    rows = []
    for article_id, labels, categories in new_articles:
        tag_html = "".join(
            "<span title='单击删除该标签'>%s</span>" % label for label in labels
        )
        category_text = ",".join("##" + name for name in categories)
        rows.append(
            "[%s]"
            % ",".join(
                json.dumps(value, ensure_ascii=False)
                for value in (article_id, tag_html, category_text)
            )
        )
    return "var arti = [" + ",\n".join(rows) + "];\n"


def main():
    """Run the whole conversion and print the JavaScript array."""
    # Read the original tags and categories of every article.
    with open("classify_lable.txt", "r", encoding="utf-8") as f:
        articles = parse_articles(f)
    print("articles: " + str(len(articles)))

    # Read the old-category -> new-categories mapping.
    with open("classfyMap.txt", "r", encoding="utf-8") as f:
        origin, now = parse_category_map(f)
    print("old category: %d" % (len(origin)))
    print("now category: %d" % (len(now)))

    new_article = [remap_article(article, origin) for article in articles]
    nolable = sum(1 for entry in new_article if len(entry[1]) == 0)
    nocategory = sum(1 for entry in new_article if len(entry[2]) == 0)
    print("finale set: %d nolable: %d nocategory: %d" % (len(new_article), nolable, nocategory))

    # Emit the JS array used by the next step.
    print(render_js(new_article))


if __name__ == "__main__":
    main()
step 5:这一步把上一步生成的js代码放到控制台运行,粘贴后执行 getdata() 开始更新。控制台仍然要在博客编辑页面里打开,这样就没有跨域的问题:脚本直接把文章id当作相对url打开,没有拼接完整地址。300+篇文章执行起来比较久。
// Step 5: write the new categories and tags back to every article.
// Usage: paste into the DevTools console of a CSDN blog edit page and run
// getdata(). Shared state uses `var` so the snippet can be pasted repeatedly.

// Rows generated by step 4: [article id, tag markup, new categories].
var arti = [
  ["77187506","<span title='单击删除该标签'>布隆过滤器</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
  ["77073144","<span title='单击删除该标签'>hyperloglog</span><span title='单击删除该标签'>基数计数</span><span title='单击删除该标签'>我只想找工作</span>","##我只想找工作"],
  ["77046721","<span title='单击删除该标签'>信号量</span><span title='单击删除该标签'>临界区</span><span title='单击删除该标签'>自旋锁</span><span title='单击删除该标签'>操作系统</span><span title='单击删除该标签'>我只想找工作</span>","##操作系统,##我只想找工作"],
  ["47438411","<span title='单击删除该标签'>2015多校联合训练赛</span><span title='单击删除该标签'>模拟</span>","##ACM-ICPC编程题,##模拟"],
];

var time_out_th = 500; // polling interval in milliseconds
var i; // index into arti of the article being processed
var a; // handle of the popup window opened for the current article
var id; // current row of arti: [article id, tag markup, categories]
var step = 0; // 0: waiting for the page to load, 1: save has been triggered
var eid = []; // ids of articles that could not be updated

/**
 * Closes the current popup (if any) and opens the next article.
 * @returns {boolean} false once every row has been processed.
 */
function openNextEdit() {
  if (a) {
    a.close();
  }
  i++;
  step = 0;
  if (i >= arti.length) {
    console.log("finish");
    return false;
  }
  id = arti[i];
  a = window.open(id[0], "_blank");
  return true;
}

/** Entry point: resets the state and starts with the first article. */
function getdata() {
  i = 0;
  step = 0;
  eid = [];
  if (arti.length === 0) {
    console.log("finish");
    return;
  }
  id = arti[i];
  a = window.open(id[0], "_blank");
  setTimeout(doing, time_out_th);
}

/**
 * One polling step. Once the popup has loaded, the new values are written
 * and the draft button is clicked; on a later poll, when the page's `saving`
 * flag is falsy, the popup is closed and the next article is opened.
 * @returns {number|undefined} the final index once all articles are done.
 */
function doing() {
  if (i >= arti.length) {
    console.log("finish");
    return i;
  }
  let finished = false;
  try {
    if (!a) {
      // window.open() yields null when the popup is blocked.
      throw new Error("window.open returned no window (popup blocked?)");
    }
    if (step === 0) {
      if (a.document.readyState === "complete") {
        a.document.getElementById("txtTag").value = id[2];
        a.document.getElementById("d_tag2").innerHTML = id[1];
        console.log("complete: " + i);
        a.document.getElementById("btnDraft").click();
        // The "saved?" check is deferred to the next poll so the save has
        // time to start before the window can be closed.
        step = 1;
      }
    } else if (!a.saving) {
      // NOTE(review): `saving` is presumably a flag maintained by the editor
      // page while a save is in flight — confirm against the page script.
      console.log("ok: " + i);
      finished = true;
    }
  } catch (err) {
    console.log(err);
    console.log("err: " + i);
    eid.push(id[0]);
    finished = true;
  }
  if (finished && !openNextEdit()) {
    return i;
  }
  setTimeout(doing, time_out_th);
}
step 6:打开文章分类,把只有0篇的分类删除掉。alert手动确认,不然好像因为缓存的原因,删不掉后面的
// Step 6: delete every category that contains 0 articles.
// Usage: paste into the DevTools console of the category management page.

/**
 * Extracts the category id from a link address.
 * @param {string} href address ending in the numeric category id.
 * @returns {string} the trailing run of digits; falls back to the last seven
 *   characters (the original behaviour) when the address does not end in a
 *   digit.
 */
function extractCategoryId(href) {
  const trailingDigits = /\d+$/.exec(href);
  return trailingDigits ? trailingDigits[0] : href.substring(href.length - 7);
}

/**
 * Sends a delete request for every link whose text is "0".
 * @param {ArrayLike<{href: string, text: string}>} links the elements with
 *   class "red" (presumably the per-category article counts — verify).
 * @returns {string[]} ids for which a delete request was sent.
 */
function deleteEmptyCategories(links) {
  const requested = [];
  for (const link of Array.from(links)) {
    const count = link.text.trim();
    if (count !== "0") {
      continue;
    }
    const categoryId = extractCategoryId(link.href);
    console.log(categoryId + " " + count);
    // Resulting request looks like: ...?t=del&id=1380215&r=83505
    $.get("?t=del", { id: categoryId, r: csdn.random() }, function () {
      // Manual confirmation on purpose: the author found that later
      // deletions did not go through without it.
      alert(1);
    });
    requested.push(categoryId);
  }
  return requested;
}

// Only touch the DOM when one exists, i.e. when run in the browser console.
if (typeof document !== "undefined") {
  deleteEmptyCategories(document.getElementsByClassName("red"));
}

源代码参考:https://gitlab.com/linyuwang/csdn-classfy-change/tree/master

step 7:按文章数对类型进行排序。在文章类别管理的console执行

// Step 7: sort the categories by article count, largest first.
// Usage: paste into the DevTools console of the category management page.
//
// The original version only updated the recorded position of the category
// being moved; the category it jumped over kept its stale position, so later
// categories were moved too few steps ("some never move"). Both positions are
// now swapped on every move.

var ids = []; // [category id, article count, current position], sorted by count
var slots = []; // slots[p] is the entry currently shown at position p
var i = 0; // target position currently being filled

/** Orders entries by article count, descending. */
var compare = function (x, y) {
  if (x[1] > y[1]) return -1;
  if (x[1] == y[1]) return 0;
  return 1;
};

/**
 * Extracts the category id from a link address.
 * @param {string} href address ending in the numeric category id.
 * @returns {string} the trailing run of digits; falls back to the last seven
 *   characters (the original behaviour) when the address does not end in a
 *   digit.
 */
function categoryIdFromHref(href) {
  const trailingDigits = /\d+$/.exec(href);
  return trailingDigits ? trailingDigits[0] : href.substring(href.length - 7);
}

/**
 * Reads the categories in page order and prepares the sorted work list.
 * @param {ArrayLike<{href: string, text: string}>} links the elements with
 *   class "red", one per category in page order (presumably the article
 *   count links — verify against the page).
 */
function loadCategories(links) {
  ids = [];
  slots = [];
  for (let position = 0; position < links.length; position++) {
    const entry = [
      categoryIdFromHref(links[position].href),
      Number(links[position].text),
      position,
    ];
    ids.push(entry);
    slots.push(entry);
  }
  ids.sort(compare);
  i = 0;
}

/**
 * Performs one "move up" per call and re-schedules itself until every
 * category sits at its sorted position.
 * Assumes the page's doExec("", id, "up") swaps the category with the one
 * directly above it — TODO confirm.
 * @returns {string|undefined} "finish" once nothing is left to move.
 */
function doing() {
  // Skip categories that already sit at (or above) their target position.
  while (i !== ids.length && ids[i][2] <= i) {
    i++;
  }
  if (i === ids.length) return "finish";

  const moving = ids[i];
  const position = moving[2];
  const displaced = slots[position - 1];
  // Mirror the swap locally so every recorded position stays accurate.
  slots[position - 1] = moving;
  slots[position] = displaced;
  moving[2] = position - 1;
  displaced[2] = position;
  doExec("", moving[0], "up");

  console.log(i + "," + moving[2] + "," + moving[1]);
  setTimeout(doing, 500);
}

// Only touch the DOM when one exists, i.e. when run in the browser console.
if (typeof document !== "undefined") {
  loadCategories(document.getElementsByClassName("red"));
  console.log(ids);
  doing();
}
















原创粉丝点击