抓取时光网(Mtime)北京影讯页面中的热映电影列表及其观众评分

来源:互联网 发布:汤恩伯 知乎 编辑:程序博客网 时间:2024/04/29 06:27
# -*- coding: utf-8 -*-

import urllib2
import os
import re

def mean_audience_score(FilmId):
    """Return the mean of the scores shown on a film's Mtime page.

    Downloads ``http://movie.mtime.com/<FilmId>/`` and averages every decimal
    number found inside a ``<span class="db_point ml6">...</span>`` element.

    Args:
        FilmId: Film identifier as a string; it is concatenated into the URL.

    Returns:
        The arithmetic mean of all matched scores as a float, or ``0.0`` when
        the page contains no matching score element.

    Errors raised by ``urllib2.urlopen`` / ``read`` are not caught here and
    propagate to the caller.
    """
    sc_url = "http://movie.mtime.com/" + FilmId + "/"
    sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
    sc_page = urllib2.urlopen(sc_req)
    try:
        page_source = sc_page.read()
    finally:
        # Always release the connection, even if reading fails.
        sc_page.close()

    # Capture the whole number in one pass. The earlier two-step match kept
    # only the first decimal digit ("7.25" -> 7.2) and accepted repeated dots
    # ("7..5"), which float() cannot parse.
    scores = re.findall(
        r'<span class="db_point ml6">(\d+\.\d+)</span>', page_source)
    if not scores:
        return 0.0
    return sum(float(score) for score in scores) / len(scores)

#------------------------------------------------------------------------------------------------------------------------------------------------------------

# Script body: download the Beijing theater page, extract the film id -> title
# mapping embedded in its "hotplaySvList = [...]" JavaScript array, print the
# mapping, then print each film together with its mean score.

url = 'http://theater.mtime.com/China_Beijing/'  # page to scrape
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
webpage = urllib2.urlopen(req)
try:
    strw = webpage.read()
finally:
    # Always release the connection, even if reading fails.
    webpage.close()

# Locate the embedded JavaScript array that lists the films.
tg_start = strw.find('hotplaySvList = [')
if tg_start == -1:
    print('not find start tag')
    # os.exit() does not exist; SystemExit terminates without a new import.
    raise SystemExit(1)
tmp = strw[tg_start:-1]  # note: the slice also drops the page's last character
tg_end = tmp.find(';')  # first ';' after the start tag ends the statement
if tg_end == -1:
    print('not find end tag')
    raise SystemExit(1)

# NOTE(review): this literal has a leading space, so one more character than
# the tag itself is skipped (presumably the first '{') -- confirm intent.
tmp = tmp[len(' hotplaySvList = ['):tg_end]
tar_ls = tmp.split("},{")  # one chunk of text per film entry

dict_film = {}  # film id (str) -> film title (str)
for t0 in tar_ls:
    ls_t = t0.split(',')
    # The id is the value after the last ':' in the first comma-separated
    # field; the title is the last double-quoted string in the final field.
    film_id = ls_t[0].split(':')[-1].strip()
    film = ls_t[-1].split('"')[-2].strip()
    dict_film[film_id] = film

for t in dict_film:
    # Two spaces after "film:" reproduce the output of the original
    # comma-separated print statement.
    print("id: " + t + " film:  " + dict_film[t])
print('ok total : ' + repr(len(dict_film)))

for t in dict_film:
    score = mean_audience_score(t)
    print("id: " + t + " film: " + dict_film[t] + " score: " + repr(score))


电影数据分析样题.docx

阅读全文
0 0
原创粉丝点击