Python 爬取直播吧 NBA 录像

来源:互联网 发布:郭德纲人品知乎 编辑:程序博客网 时间:2024/04/30 21:33

一、运行结果





二、源码

# -*- coding:utf-8 -*-
"""Scrape per-quarter NBA replay links from zhibo8.cc into text files.

The script looks up one match-up on the site's replay index page, follows
every matching replay page and writes the quarter/overtime video links of
each game into a text file inside a directory named after the match-up.
"""
import urllib
try:
    import urllib2
except ImportError:
    # Python 3 merged urllib2 into urllib.request; alias it so the rest of
    # the module can keep calling urllib2.urlopen().
    import urllib.request as urllib2
import re
import os
import codecs


class NBA(object):
    """Collect replay links for a single match-up.

    Attributes:
        base_url: Site root that relative hrefs are appended to.
        team: Match-up name in ``<home>vs<guest>`` form. It is also used as
            the name of the output directory.
    """

    # Page title and date of a replay page: group 1 is the text before
    # u'全场录像', group 2 the text of the following <span> up to a space.
    _TITLE_RE = re.compile(
        u'<title>(.*?)全场录像.*?<span>(.*?) .*?</span>', re.S)

    # One game: a <strong> heading followed by four quarter links and one
    # overtime link.
    _VIDEO_RE = re.compile(
        u' <strong>(.*?)</strong>.*?'
        u'观看视频:<a href="(.*?)".*?'
        u'观看视频:<a href="(.*?)".*?'
        u'观看视频:<a href="(.*?)".*?'
        u'观看视频:<a href="(.*?)".*?'
        u'加时赛:<a href="(.*?)".*?',
        re.S)

    # Labels written in front of the five links captured by _VIDEO_RE.
    _SECTION_LABELS = (u'第1节 ', u'第2节 ', u'第3节 ', u'第4节 ', u'加时赛 ')

    def __init__(self, team=u'(热火vs马刺)'):
        """Remember the match-up to look for.

        Args:
            team: Match-up name such as u'热火vs马刺'. A name wrapped in one
                pair of parentheses (like the default) is accepted as well.
        """
        self.base_url = u'http://www.zhibo8.cc'
        self.team = team

    def get_game_main_url(self, unicode_page, team=u'(热火vs马刺)'):
        """Find the replay pages for *team* on the replay index page.

        Args:
            unicode_page: Decoded HTML of the index page.
            team: Regex fragment that must contain exactly one capturing
                group around the match-up name, e.g. u'(热火vs马刺)'.

        Returns:
            A list of ``[matched_name, absolute_url]`` pairs, one per hit.
        """
        pattern = u'<div class="box">.*?' + team + u'.*?href="(.*?)"'
        # The flag belongs to compile(): the second positional argument of a
        # compiled pattern's findall() is the start position, not the flags.
        pattern_regx = re.compile(pattern, re.S)
        return [[found[0], self.base_url + found[1]]
                for found in pattern_regx.findall(unicode_page)]

    def get_section_url(self, luxiang_url):
        """Download one replay page and save its video links to a file.

        The file is created inside the ``self.team`` directory (which must
        already exist) and named from the first five characters of the date
        span plus the page title. Nothing is written when the page has no
        recognisable title.

        Args:
            luxiang_url: Absolute URL of a replay page.
        """
        the_page = self._fetch(luxiang_url)

        title = self._TITLE_RE.search(the_page)
        if title is None:
            return
        print(title.group(1))

        file_name = (self.team + u'/' + title.group(2)[:5]
                     + title.group(1).strip())
        # "with" closes the file even when a write fails part-way.
        with codecs.open(file_name, u'w', u'utf-8') as save_file:
            for item in self._VIDEO_RE.findall(the_page):
                lines = [item[0]]
                lines.extend(label + link for label, link
                             in zip(self._SECTION_LABELS, item[1:]))
                # Each game is followed by a blank line.
                save_file.write(u'\r\n'.join(lines) + u'\r\n\r\n')

    def nba_make_dir(self):
        """Create the output directory named ``self.team`` if it is missing."""
        if os.path.exists(self.team):
            print(self.team + u' 目录已经存在')
        else:
            os.mkdir(self.team)
            print(self.team + u' 目录创建成功')

    def start(self):
        """Run the whole scrape: index page -> replay pages -> text files."""
        self.nba_make_dir()
        main_page = self._fetch(self.base_url + u'/nba/luxiang.htm')

        luxiang_url = []
        for team_pattern in self._team_patterns():
            luxiang_url.extend(
                self.get_game_main_url(main_page, team_pattern))

        for item in luxiang_url:
            self.get_section_url(item[1])

    def _fetch(self, url):
        """Return the body of *url* decoded as UTF-8, closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read().decode(u'utf-8')
        finally:
            response.close()

    def _team_patterns(self):
        """Return the regex fragments for both orderings of the match-up.

        The site may list the game as ``AvsB`` or ``BvsA``, so both are
        searched. A name without ``vs`` yields a single pattern.
        """
        name = self.team
        # Tolerate a name that already carries the grouping parentheses
        # (the constructor default does) instead of wrapping it twice.
        if name.startswith(u'(') and name.endswith(u')'):
            name = name[1:-1]

        patterns = [u'(' + name + u')']
        home, separator, guest = name.partition(u'vs')
        if separator:
            patterns.append(u'(' + guest + u'vs' + home + u')')
        return patterns


if __name__ == '__main__':
    nba = NBA(u'热火vs马刺')
    nba.start()


0 0