抓取北京影讯的电影信息

来源：互联网　发布：汤恩伯知乎　编辑：程序博客网　时间：2024/04/29 06:27

# -*- coding: utf-8 -*-

import urllib2

import os

import re

def mean_audience_score(FilmId, page_source=None):
    """Return the mean of the audience scores shown on a film's Mtime page.

    Args:
        FilmId: Film identifier; it is inserted into
            ``http://movie.mtime.com/<FilmId>/`` to build the page URL.
        page_source: Optional HTML text of the film page. When ``None``
            (the default) the page is downloaded with ``urllib2``; pass
            the text explicitly to score a page that was already fetched.

    Returns:
        The arithmetic mean (float) of every score found inside a
        ``<span class="db_point ml6">`` element, or ``0.0`` when the page
        contains no such element.

    Raises:
        Any error raised by ``urllib2.urlopen`` propagates unchanged; the
        function does not catch network failures.
    """
    if page_source is None:
        sc_url = "http://movie.mtime.com/%s/" % FilmId
        sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
        sc_page = urllib2.urlopen(sc_req)
        try:
            page_source = sc_page.read()  # raw HTML of the film page
        finally:
            # Release the connection even if reading fails.
            sc_page.close()

    # A single capture group extracts the complete decimal number, so a
    # score such as "7.25" is no longer truncated to "7.2", and malformed
    # text such as "8..5" is skipped instead of crashing float().
    scores = [
        float(number)
        for number in re.findall(
            r'<span class="db_point ml6">(\d+\.\d+)</span>', page_source)
    ]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

#------------------------------------------------------------------------------------------------------------------------------------------------------------

# Scrape the Beijing listings page, pull the film id -> title pairs out of
# the inline ``hotplaySvList = [...]`` assignment, then print each film
# together with its mean audience score.
url = 'http://theater.mtime.com/China_Beijing/'  # 1. page to scrape

# 2. urllib2.Request(url[, data][, headers][, origin_req_host][, unverifiable])
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})

webpage = urllib2.urlopen(req)  # urllib2.urlopen(url[, data][, timeout])
try:
    strw = webpage.read()
finally:
    # Release the connection even if reading fails.
    webpage.close()

start_tag = 'hotplaySvList = ['
tg_start = strw.find(start_tag)  # str.find(str, beg=0, end=len(string))
if tg_start == -1:
    print('not find start tag')
    # os.exit() does not exist; SystemExit terminates the script instead.
    raise SystemExit(1)

# The list literal is taken to end at the first ';' after the start tag.
tg_end = strw.find(';', tg_start)
if tg_end == -1:
    print('not find end tag')
    raise SystemExit(1)

tmp = strw[tg_start + len(start_tag):tg_end]

# 3. Split the list body into one chunk per film on the "},{" separator.
tar_ls = tmp.split("},{")

dict_film = {}  # film id (str) -> film title (str)
for t0 in tar_ls:
    # Drop the list/object brackets left on the first and last chunks.
    entry = t0.strip(' \t\r\n[]{}')
    if not entry:
        continue  # empty list: nothing to record
    ls_t = entry.split(',')
    # Id: the text after the last ':' in the first comma-separated field.
    film_id = ls_t[0].split(':')[-1].strip()
    # Title: the second-to-last quote-delimited piece of the last field.
    # NOTE(review): this assumes the last field is a quoted string that
    # contains no comma -- confirm against the live page format.
    film = ls_t[-1].split('"')[-2].strip()
    dict_film[film_id] = film

for t in dict_film:
    # The explicit " " reproduces the separator that the Python 2
    # statement ``print a, b`` inserted between its two arguments.
    print("id: " + t + " film: " + " " + dict_film[t])

print('ok total : ' + repr(len(dict_film)))

for t in dict_film:
    score = mean_audience_score(t)
    print("id: " + t + " film: " + dict_film[t] + " score: " + repr(score))

阅读全文

0 0