Python 白云黄鹤十大

来源:互联网 发布:淘宝网汉服女装 编辑:程序博客网 时间:2024/04/30 00:45

一、效果

在终端中浏览白云黄鹤 BBS 的十大热帖;程序能直接捕获按键输入,无需按 Enter 键确认。



二、源码

# -*- coding:utf-8 -*-
"""Terminal reader for the "top ten" threads of the BaiYunHuangHe BBS.

The script logs in, downloads the top-ten XML list, lets the user pick a
thread by number and then pages through its floors (posts) with single key
presses that do not need to be confirmed with Enter.

The code runs unchanged on Python 2 and Python 3.
"""
import codecs
import os
import re
import select
import sys
import termios
import time
import urllib

try:  # Python 2 module names
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 equivalents, bound to the same names
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode

try:
    import chardet
except ImportError:  # optional: FALLBACK_ENCODING is used instead
    chardet = None

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Encoding used when no detected encoding is available.  It is the same
# encoding the thread pages are always decoded with.
FALLBACK_ENCODING = 'gb2312'


def kbhit():
    """Return one pending character from stdin, or '' if none is waiting.

    Waits at most 10 ms, so callers can poll it in a loop.
    """
    ready, _, _ = select.select([sys.stdin], [], [], 0.01)
    if ready:
        return sys.stdin.read(1)
    return ''


def key_init():
    """Clear the ICANON and ECHONL local-mode flags on stdin's terminal.

    With canonical mode off, single key presses become readable without
    waiting for Enter.

    Returns:
        The terminal attributes that were active before the change, so the
        caller can restore them with ``termios.tcsetattr``.
    """
    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    # Copy the attribute list so ``old_settings`` keeps the original flags.
    # Only index 3 (local modes) is reassigned, so a shallow copy suffices.
    new_settings = list(old_settings)
    new_settings[3] &= ~termios.ICANON
    new_settings[3] &= ~termios.ECHONL
    termios.tcsetattr(fd, termios.TCSAFLUSH, new_settings)
    return old_settings


class BaiYunHuangHe(object):
    """Client that lists and displays the BBS top-ten threads."""

    # One <board>/<title>/<reply_count>/<file> group per top-ten entry.
    _TOP10_RE = re.compile(
        u'<board>(.*?)</board>.*?'
        u'<title>(.*?)</title>.*?'
        u'<reply_count>(.*?)</reply_count>.*?'
        u'<file>(.*?)</file>',
        re.S)
    # Bracketed page links.  The '?' after "bbsnewtcon" is an unescaped regex
    # quantifier, so the captured group starts at the literal '?' of the
    # query string; ``base_url + group`` is therefore a complete URL.
    _NEXT_PAGE_RE = re.compile(
        u'\\[<a href="bbsnewtcon?(.*?)".*?</a>\\]', re.S)
    # Each floor (post) is delivered inside its own <textarea>.
    _FLOOR_RE = re.compile(u'<textarea.*?>(.*?)</textarea>', re.S)

    def __init__(self, user_id=u'Garfield', password=u'ycl19910110'):
        """Prepare the cookie-aware opener and the login form data.

        Args:
            user_id: BBS account name sent as the ``id`` form field.
            password: BBS password sent as the ``pw`` form field.

        NOTE(review): the defaults are the credentials that were hard-coded
        in the original script; pass your own instead of relying on them.
        """
        self.log_url = u'http://bbs.whnet.edu.cn/cgi-bin/bbslogin'
        self.top10_url = u'http://bbs.whnet.edu.cn/xml/posttop10.xml'
        self.base_url = u'http://bbs.whnet.edu.cn/cgi-bin/bbsnewtcon'
        self.cookie = cookielib.CookieJar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookie))
        # URL-encoded form data; bytes are required for POST on Python 3.
        self.postdata = urlencode(
            [('id', user_id), ('pw', password)]).encode('ascii')
        self.coding_dict = {}
        self.info = []

    def _page_encoding(self):
        """Return the detected encoding, or FALLBACK_ENCODING if unknown."""
        return self.coding_dict.get('encoding') or FALLBACK_ENCODING

    def login(self):
        """POST the login form and remember the detected page encoding."""
        request = urllib2.Request(url=self.log_url, data=self.postdata)
        response = self.opener.open(request)
        try:
            log_page = response.read()
        finally:
            response.close()
        detected = chardet.detect(log_page) if chardet is not None else None
        self.coding_dict = detected or {}

    def get_top10_url(self, page):
        """Parse the top-ten XML text into a list of entries.

        Args:
            page: decoded text of the top-ten XML document.

        Returns:
            A list of ``[board, title, reply_count, url, file]`` lists, one
            per entry found, in document order.
        """
        return [
            [board, title, reply_count,
             self.base_url + u'?board=%s&file=%s' % (board, file_id),
             file_id]
            for board, title, reply_count, file_id
            in self._TOP10_RE.findall(page)
        ]

    def get_detail_one_page(self, item):
        """Download one page of a thread and return it as text.

        Args:
            item: an entry as produced by ``get_top10_url``; ``item[3]`` is
                the URL to fetch, ``item[0]`` and ``item[4]`` are posted as
                the ``board`` and ``file`` form fields.

        Returns:
            The page decoded as gb2312, with undecodable bytes dropped.
        """
        headers = {
            u'Host': u'bbs.whnet.edu.cn',
            u'User-Agent': (u'Mozilla/5.0 (Windows NT 5.1; rv:38.0) '
                            u'Gecko/20100101 Firefox/38.0'),
        }
        postdata = urlencode(
            {u'board': item[0], u'file': item[4]}).encode('ascii')
        # Crawling too often gets the client blocked by the site; put a
        # proxy mapping such as {"http": "http://host:port"} here to avoid it.
        proxy_handler = urllib2.ProxyHandler({})
        opener = urllib2.build_opener(proxy_handler)
        request = urllib2.Request(item[3], headers=headers, data=postdata)
        response = opener.open(request)
        try:
            detail_page = response.read()
        finally:
            response.close()
        # The encoding reported by chardet is not always reliable, so the
        # page is decoded as gb2312 and undecodable bytes are ignored.
        return detail_page.decode(u'gb2312', 'ignore')

    def display_foot(self, index_floor, all_floor):
        """Print the "floor X of Y" footer and the key help line."""
        print(u'*****第%d楼(共%d楼)*****' % (index_floor, all_floor))
        print(u'''【'n' / 'm'】上下楼切换;【q】 返回十大''')

    def _show_floor(self, floors, index):
        """Print floor ``index`` (0-based) followed by the footer."""
        print(floors[index].strip())
        self.display_foot(index + 1, len(floors))

    def _browse_floors(self, floors):
        """Page through ``floors`` with the 'n', 'm' and 'q' keys.

        'n' moves to the next floor (moving past the last one leaves the
        viewer), 'm' moves to the previous floor, 'q' leaves the viewer.
        """
        index = 0
        self._show_floor(floors, index)
        while True:
            key = kbhit()
            if key == u'n':
                index += 1
                if index >= len(floors):
                    break
                self._show_floor(floors, index)
            elif key == u'm':
                index = max(index - 1, 0)
                self._show_floor(floors, index)
            elif key == u'q':
                # Just return: the caller's menu loop redisplays the list.
                # Re-entering handler_top10() from here would nest menus and
                # require one extra 'q' per viewed thread to exit.
                break

    def get_detail_info(self, item):
        """Fetch every page of a thread and let the user browse its floors.

        Args:
            item: an entry as produced by ``get_top10_url``.  It is not
                modified.

        Returns:
            None.  Returns immediately when the first page has no floors.
        """
        first_page = self.get_detail_one_page(item)
        floors = self._FLOOR_RE.findall(first_page)
        if not floors:
            return None
        # The first bracketed link is skipped, as in the original script.
        links = self._NEXT_PAGE_RE.findall(first_page)[1:]
        for link in links:
            # Work on a copy so the shared entry in self.info keeps pointing
            # at the first page the next time the thread is opened.
            page_item = list(item)
            page_item[3] = self.base_url + link
            floors.extend(
                self._FLOOR_RE.findall(self.get_detail_one_page(page_item)))
        self._browse_floors(floors)
        return None

    def handler_top10(self):
        """Show the top-ten menu until the user enters 'q' (or stdin ends)."""
        while True:
            for order, item in enumerate(self.info, 1):
                print(u'%d %s %s %s' % (order, item[1], item[2], item[0]))
            print(u''' 输入序号(【q】退出): ''')
            try:
                choice = _read_line().strip()
            except EOFError:
                break
            if choice == u'q':
                break
            try:
                number = int(choice)
            except ValueError:
                continue  # not a number: show the menu again
            if 1 <= number <= len(self.info):
                self.get_detail_info(self.info[number - 1])

    def get_top10_page(self):
        """Download the top-ten list, parse it and start the menu."""
        response = self.opener.open(self.top10_url)
        try:
            top10_page = response.read()
        finally:
            response.close()
        # Undecodable bytes are dropped, matching get_detail_one_page.
        uni_top10_page = top10_page.decode(self._page_encoding(), 'ignore')
        self.info = self.get_top10_url(uni_top10_page)
        self.handler_top10()

    def write_to_file(self, filename, uni_str):
        """Write ``uni_str`` to ``filename`` encoded as UTF-8."""
        with codecs.open(filename, u'w', u'utf-8') as out:
            out.write(uni_str)

    def start(self):
        """Log in, then run the interactive top-ten browser."""
        self.login()
        self.get_top10_page()


def main():
    """Run the reader and restore the terminal mode afterwards."""
    old_settings = key_init()
    try:
        BaiYunHuangHe().start()
    finally:
        termios.tcsetattr(
            sys.stdin.fileno(), termios.TCSADRAIN, old_settings)


if __name__ == u'__main__':
    main()


0 0
原创粉丝点击