程序开发常用轮子

来源:互联网 发布:python 灰帽子下载 编辑:程序博客网 时间:2024/05/24 04:23

【python】新闻页正文抽取

# -*- coding: utf-8 -*-
"""News-page main-text extraction, v1.0 (Python 3).

The page is downloaded, everything except <p> tags is stripped from its
<body>, and the densest run of text lines is returned as the article text.
"""
import multiprocessing
import re
import sys
from urllib.parse import urljoin

try:
    # Not used by this module; kept optional so the text helpers below can
    # be imported on machines without a MySQL driver.
    import MySQLdb as mdb
except ImportError:
    mdb = None

DBUG = 0

reBODY = re.compile(r'<body.*?>([\s\S]*?)<\/body>', re.I)
reBODY2 = re.compile(r'<script.*?>([\s\S]*?)<\/script>', re.I)
reBODY3 = re.compile(r'<style.*?>([\s\S]*?)</style>', re.I)
# NOTE(review): greedy on purpose in the original - removes everything from
# the first "{" to the last "}" that survives script/style removal.
reBODY4 = re.compile(r'{[\s\S]*}', re.I)
# [\s\S] instead of "." so that comments spanning several lines are removed.
reCOMM = r'<!--[\s\S]*?-->'

_RE_META_CHARSET = re.compile(br'<meta[^>]*?charset=["\']?\s*([\w.:-]+)', re.I)
_RE_TITLE = re.compile(r'<title[^>]*>([\s\S]*?)</title>', re.I)
_RE_IMG_SRC = re.compile(r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']*)["\']', re.I)


def search(req, html):
    """Return group 1 of the first match of ``req`` in ``html``.

    Returns the string ``'no'`` when there is no match.
    """
    match = re.search(req, html)
    return match.group(1) if match else 'no'


def _decodePage(raw, declared):
    """Decode the raw response bytes to text.

    Candidates are tried in order: charset from the page's <meta> tag, the
    encoding announced by the HTTP response, UTF-8, GB18030 (a superset of
    GBK/GB2312).  ``iso-8859-1`` is skipped because requests reports it as a
    default when the server sent no charset, and it never fails to decode.
    """
    if not isinstance(raw, bytes):
        return raw
    candidates = []
    meta = _RE_META_CHARSET.search(raw)
    if meta:
        candidates.append(meta.group(1).decode('ascii', 'ignore'))
    if declared and declared.lower() != 'iso-8859-1':
        candidates.append(declared)
    candidates.extend(['utf-8', 'gb18030'])
    for name in candidates:
        try:
            return raw.decode(name)
        except (LookupError, UnicodeDecodeError):
            continue
    return raw.decode('utf-8', 'replace')


class Extractor(object):
    """Extract the title and main text of one news page.

    Args:
        url: page address.
        blockSize: number of consecutive lines summed into one text block.
        timeout: HTTP timeout in seconds.
        image: stored as ``saveImage``; not used by the extraction itself.
        proxies: optional ``requests``-style proxy mapping, e.g.
            ``{"http": "http://user:pass@host:port", "https": ...}``.
    """

    def __init__(self, url="", blockSize=3, timeout=5, image=False,
                 proxies=None):
        self.url = url
        self.blockSize = blockSize
        self.timeout = timeout
        self.saveImage = image
        self.proxies = proxies
        self.rawPage = ""
        self.title = ""
        self.body = ""
        self.img = ""
        self.ctexts = []
        self.cblocks = []

    def getRawPage(self):
        """Download ``self.url``.

        Sets ``self.title`` (``'no'`` when the page has no <title>) and
        returns ``(status_code, page_text)``.  Network errors raised by
        ``requests`` propagate to the caller.
        """
        # Imported lazily so the pure text-processing methods stay usable
        # without the HTTP dependency installed.
        import requests

        # No explicit "Host" header: requests derives it from the URL.  The
        # old hand-built value became the literal 'no' for URLs without a
        # path.  "sdch" is not advertised because requests cannot decode it.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
        resp = requests.get(self.url, headers=headers, timeout=self.timeout,
                            proxies=self.proxies)
        if DBUG:
            print(resp.encoding, resp.headers.get('content-type'))
        content = _decodePage(resp.content, resp.encoding)
        self.title = search(_RE_TITLE, content).strip()
        return resp.status_code, content

    def processTags(self):
        """Reduce ``self.body`` to text plus bare <p> tags.

        Also stores the absolute URL of the first image in ``self.img``
        ('' when the body has no <img>).
        """
        # The image must be looked up before the tags are stripped below,
        # otherwise there is no <img> left to find.
        found = _RE_IMG_SRC.search(self.body)
        self.img = urljoin(self.url, found.group(1).strip()) if found else ''

        body = self.body
        for pattern in (reBODY, reBODY2, reBODY3, reBODY4, reCOMM):
            body = re.sub(pattern, "", body)
        # Drop every tag except <p>/</p> and cut "next article" link lines.
        # \b keeps <pre>, <param>, ... from being mistaken for <p>.
        body = re.sub(r'<(?!/?p\b)[^<>]*?>|下一篇.*', '', body)
        body = re.sub(r'<p\b[^>]*?>', '<p>', body)
        self.body = re.sub(r'[\t\r\f\v]', '', body)

    def processBlocks(self):
        """Return the densest run of lines of ``self.body`` as one string.

        Block ``i`` is the summed length of lines ``i .. i+blockSize-1``.
        Starting from the longest block, the selection grows in both
        directions while blocks stay longer than the shortest line.  Bodies
        too short to form a single block are returned whole.
        """
        self.ctexts = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]
        lines = len(self.ctexts)
        blockCount = lines - self.blockSize - 1
        if blockCount <= 0:
            self.cblocks = []
            self.start, self.end = 0, lines
            return "".join(self.ctexts)

        self.cblocks = [sum(self.textLens[i:i + self.blockSize])
                        for i in range(blockCount)]
        maxTextLen = max(self.cblocks)
        if DBUG:
            print(maxTextLen)
        threshold = min(self.textLens)
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > threshold:
            self.start -= 1
        # Bounded by the number of blocks: the old bound was one larger and
        # indexed past the end when dense text reached the last block.
        while self.end < blockCount and self.cblocks[self.end] > threshold:
            self.end += 1
        return "".join(self.ctexts[self.start:self.end])

    def getContext(self):
        """Download the page and return ``(title, main_text)``."""
        code, self.rawPage = self.getRawPage()
        bodies = reBODY.findall(self.rawPage)
        # Fall back to the whole document when there is no <body> element.
        self.body = bodies[0] if bodies else self.rawPage
        if DBUG:
            print(code, self.rawPage)
        self.processTags()
        return self.title, self.processBlocks()


def getZwIndex(url):
    """Return ``(title, main_text)`` for ``url`` using a block size of 1."""
    ext = Extractor(url=url, blockSize=1, image=False)
    return ext.getContext()


# Example: process several URLs in parallel.
# pool = multiprocessing.Pool(processes=3)
# for url in url_list:
#     pool.apply_async(getZwIndex, (url, ))
# pool.close()
# pool.join()

【github】百宝箱

https://github.com/a631381602

【python】MD5生成

import hashlib


def md5_hex(src):
    """Return the hexadecimal MD5 digest of ``src``.

    ``src`` may be bytes or text; text is encoded as UTF-8 first because
    hashlib only accepts bytes.
    """
    if not isinstance(src, bytes):
        src = src.encode('utf-8')
    m2 = hashlib.md5()
    m2.update(src)
    return m2.hexdigest()


if __name__ == '__main__':
    print(md5_hex('hello'))

【python】CSV读写

# coding: utf-8
"""CSV write / read example (Python 3)."""
import csv

# ---- Write rows and create the CSV file ----
# newline='' lets the csv module control line endings itself; the explicit
# encoding keeps the Chinese text readable regardless of the OS locale.
with open('csv_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['姓名', '年龄', '电话'])
    data = [
        ('小河', '25', '1234567'),
        ('小芳', '18', '789456'),
    ]
    writer.writerows(data)

# ---- Read the CSV file back ----
with open('csv_test.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for line in reader:
        print(line)

【shell】批量添加文件后缀名

# Append ".txt" to the name of every entry in the current directory.
# "./*" plus "--" keeps names that start with "-" from being read as mv
# options; the -e test skips the literal "./*" left when nothing matches.
for i in ./*; do
    [ -e "$i" ] || continue
    mv -- "$i" "$i.txt"
done

【python】提取中文

# coding: utf-8
"""Extract runs of Chinese characters from mixed-language text (Python 3)."""
import re

s = """
    en: Regular expression is a powerful tool for manipulating text.
    zh: 汉语是世界上最优美的语言,正则表达式是一个很有用的工具
    jp: 正規表現は非常に役に立つツールテキストを操作することです。
    jp-char: あアいイうウえエおオ
    kr:정규 표현식은 매우 유용한 도구 텍스트를 조작하는 것입니다.
    """

# One or more consecutive characters in U+4E00..U+9FA5.  Japanese kanji fall
# in the same range, so they are matched as well; kana and Hangul are not.
re_words = re.compile("[\u4e00-\u9fa5]+")

# First match only.
m = re_words.search(s, 0)
print("unicode 中文")
print("--------")
print(m)
print(m.group())

# Every match in the text.
res = re_words.findall(s)
if res:
    print("There are %d parts:\n" % len(res))
    for r in res:
        print(r)
print("--------\n")

【python】Mysql读写

"""MySQL write / read example (Python 3, MySQLdb / mysqlclient)."""
import csv
import os
import re
import smtplib
import sys
import time
from contextlib import closing

import MySQLdb
import MySQLdb as mdb

# ---- Write to MySQL ----
today = time.strftime('%Y-%m-%d')

# NOTE: sql_bd_m_uv is not defined in this snippet; it is expected to be a
# sequence holding the remaining column values of one bd_m_uv row.
db = MySQLdb.connect("localhost", "root", "", "seo_data", charset="utf8")
with closing(db):
    values = [today] + list(sql_bd_m_uv)
    # Only the placeholder list is built with string formatting; the values
    # themselves are passed separately so the driver escapes them.
    sql = 'INSERT INTO bd_m_uv VALUES (%s)' % ','.join(['%s'] * len(values))
    cursor = db.cursor()
    try:
        cursor.execute(sql, values)
        db.commit()
        print('done')
    except MySQLdb.Error as err:
        db.rollback()
        # Still best-effort, but the failure is no longer silent.
        sys.stderr.write('insert into bd_m_uv failed: %s\n' % err)

# ---- Read from MySQL ----
con = mdb.connect('localhost', 'root', '', 'url_push', charset='utf8')
with closing(con):
    cur = con.cursor()
    cur.execute("select * from hx")
    for row in cur.fetchall():
        print(row[0])

【python】判断当前字符串是否全部为中文

# coding: utf-8
import sys


def check_contain_chinese(check_str):
    """Return 1 when every character of ``check_str`` is Chinese, else 0.

    "Chinese" means a code point in U+4E00..U+9FFF.  ``check_str`` may be
    text or UTF-8 encoded bytes.  An empty string contains no non-Chinese
    character and therefore yields 1.
    """
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    for ch in check_str:
        if not ('\u4e00' <= ch <= '\u9fff'):
            return 0
    return 1

【python】计算正文字数

'''计算正文字数'''text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),newcontent) text2 = re.sub('<[^>]*?>','',text)  words_number = len(text2)

【python】html块去除杂乱标签

import re

_RE_SCRIPT = re.compile(r'<script.*?>([\s\S]*?)<\/script>', re.I)
_RE_STYLE = re.compile(r'<style.*?>([\s\S]*?)</style>', re.I)
# NOTE(review): greedy as in the original - removes everything between the
# first "{" and the last "}" that survives script/style removal.
_RE_BRACES = re.compile(r'{[\s\S]*}')
# Every tag except <p>, </p> and <img>.  \b stops <pre>, <param>, ... from
# being treated as <p>.
_RE_OTHER_TAG = re.compile(r'<(?!(?:/?p|img)\b)[^<>]*?>', re.I)
_RE_P_OPEN = re.compile(r'<p\b[^>]*?>', re.I)


def clean_html(text):
    """Strip clutter from an HTML fragment, keeping only <p> and <img> tags.

    ``text`` is the article HTML.  Scripts, styles and brace blocks are
    removed, all other tags are dropped, leading/trailing whitespace is
    trimmed and every opening <p ...> is normalised to a bare <p>.
    """
    a = _RE_SCRIPT.sub('', text)
    b = _RE_STYLE.sub('', a)
    c = _RE_BRACES.sub('', b)
    d = _RE_OTHER_TAG.sub('', c).strip()
    e = _RE_P_OPEN.sub('<p>', d)
    return e

【python】将反斜杠u类型(\uXXXX)的字符串,转换为对应的unicode的字符串

# Text holding literal backslash-u escapes (\uXXXX), not the characters.
slashUStr = "\\u0063\\u0072\\u0069\\u0066\\u0061\\u006E\\u0020\\u5728"
# Python 3 str has no .decode(); go through ASCII bytes so the
# unicode_escape codec can turn each \uXXXX into its character.
decodedUniChars = slashUStr.encode("ascii").decode("unicode-escape")
print("decodedUniChars=", decodedUniChars)

【python】json、dict转化

import json

# Sample input so the snippet runs on its own; replace with real JSON text.
user = '{"name": "xiaohe", "age": 25}'

# JSON text -> dict
json_2_dict = json.loads(user)
print(json_2_dict)

# dict -> JSON text
dict_2_jsonstr = json.dumps(json_2_dict)
print(dict_2_jsonstr)

【python】通过UA识别wap还是pc来访

import re

# Keywords that identify a mobile browser anywhere in the User-Agent.
# Single backslashes: inside a raw string "\\d" would require a literal
# backslash in the User-Agent and therefore never match.
_REG_B = re.compile(
    r"(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal"
    r"|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp"
    r"|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone"
    r"|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo"
    r"|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino",
    re.I | re.M)

# Four-character prefixes used by older handsets.
_REG_V = re.compile(
    r"1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)"
    r"|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )"
    r"|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa"
    r"|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica"
    r"|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)"
    r"|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit"
    r"|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)"
    r"|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a"
    r"|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)"
    r"|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)"
    r"|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)"
    r"|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)"
    r"|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800"
    r"|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)"
    r"|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9"
    r"|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/"
    r"|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)"
    r"|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)"
    r"|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)"
    r"|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)"
    r"|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )"
    r"|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-",
    re.I | re.M)


def getUA(ua):
    """Classify a User-Agent string as ``'wap'`` (mobile) or ``'pc'``.

    The full string is checked for mobile-browser keywords and its first
    four characters for known handset prefixes.  An empty or missing
    User-Agent is classified as ``'pc'``.
    """
    ua = ua or ''
    if _REG_B.search(ua) or _REG_V.search(ua[0:4]):
        return 'wap'
    return 'pc'

【linux】根据文件名杀死进程

# Force-kill (SIGKILL) every process whose command line contains "zq2.py".
# pkill matches against the full command line with -f, never matches itself,
# and simply exits non-zero when nothing matches (the ps|grep|xargs pipeline
# ran "kill -9" with no arguments in that case).
pkill -9 -f 'zq2\.py'

【python】日期遍历

import datetime
import time


def date_range(start, end, only_monday=False, input_format='%y%m%d', output_format='%y%m%d'):
    """Return every date from ``start`` to ``end`` (inclusive) as strings.

    Args:
        start, end: first and last day; converted with ``str()`` and parsed
            with ``input_format``.
        only_monday: when true, only Mondays are returned.
        input_format: ``strptime`` format of ``start`` / ``end``.
        output_format: ``strftime`` format of the returned strings.

    Returns:
        A list of formatted dates; empty when ``start`` is after ``end``.

    Example:
        >>> date_range(140130, 140202)
        ['140130', '140131', '140201', '140202']
    """
    first = datetime.datetime.strptime(str(start), input_format)
    last = datetime.datetime.strptime(str(end), input_format)
    one_day = datetime.timedelta(days=1)

    range_ = []
    d = first
    while d <= last:
        # weekday() == 0 is Monday.
        if not only_monday or d.weekday() == 0:
            range_.append(d.strftime(output_format))
        d += one_day
    return range_

【nginx】针对PC来访返回404

 # Return 404 unless the visitor is a mobile device or a spider/bot.
    set $mobile_rewrite do_not_perform;

    # Mobile-browser or crawler keyword anywhere in the User-Agent.
    # "~*" is case-insensitive, so one spelling of "spider" / "bot" suffices.
    if ($http_user_agent ~* "(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|spider|bot") {
        set $mobile_rewrite perform;
    }

    # Known handset prefix at the start of the User-Agent.
    if ($http_user_agent ~* "^(1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-)") {
        set $mobile_rewrite perform;
    }

    if ($mobile_rewrite != perform) {
        return 404;
    }

大家有好的轮子可以给我留言~

0 0
原创粉丝点击