Python爬虫实战:抓取猫眼电影TOP100并存放到MongoDB中

来源:互联网 发布:如何面试java应聘者 编辑:程序博客网 时间:2024/06/04 19:02

猫眼的网页结构非常简单,这里就不做网页分析了,直接上代码。


main.py

# Entry script: fetch every page of the Maoyan board and hand each one to
# get_info, which parses it and stores the entries.
from get_info import get_info

# URL template. The placeholder is followed by a literal '0', so page indices
# 0..9 become the offsets '00', '10', ..., '90'.
url = 'http://maoyan.com/board/4?offset={}0'

if __name__ == '__main__':
    # Ten pages in total, processed in order.
    for page in range(10):
        get_info(url, page)

get_info.py

# Helpers that download one page of the Maoyan board, extract the fields of
# each listed film with CSS selectors, and insert them into MongoDB.
import re

import requests
from bs4 import BeautifulSoup

from pymongo_database import movie_item_info

# Number of film entries read from each page (the <dd> children 1..10).
_ITEMS_PER_PAGE = 10

# Selector prefixes shared by the per-film lookups; '{}' is the 1-based index
# of the <dd> element inside the board list.
_ITEM_SELECTOR = 'div.main > dl > dd:nth-of-type({})'
_INFO_SELECTOR = _ITEM_SELECTOR + ' > div > div > div.movie-item-info'
_SCORE_SELECTOR = (_ITEM_SELECTOR
                   + ' > div > div > div.movie-item-number.score-num > p')

# Separator between a field label and its value.
# NOTE(review): the listing splits on the ASCII colon; the live page may use
# the full-width colon instead, so both are accepted -- confirm against the
# actual markup.
_LABEL_SEPARATOR = re.compile('[:\uff1a]')


def _select_first(html, selector, num):
    """Return the first element matching *selector* for film number *num*.

    Raises:
        IndexError: if nothing matches. This is the exception type the plain
            ``[0]`` lookup would raise, but with the failing selector named.
    """
    css = selector.format(num)
    matches = html.select(css)
    if not matches:
        raise IndexError('no element matches selector {!r}'.format(css))
    return matches[0]


def _after_label(text):
    """Return the part of *text* after the last label colon, stripped."""
    return _LABEL_SEPARATOR.split(text)[-1].strip()


def get_html(url, page, timeout=10):
    """Download one board page and parse it.

    Args:
        url: URL template containing one ``{}`` placeholder.
        page: Value substituted into the placeholder.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` document built with the ``lxml`` parser.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url.format(page), timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # failing later on a missing element.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def get_a_rank(html, num):
    """Return the text of the rank element of film number *num*."""
    return _select_first(html, _ITEM_SELECTOR + ' > i', num).text


def get_a_title(html, num):
    """Return the text of the title link of film number *num*."""
    return _select_first(html, _INFO_SELECTOR + ' > p.name > a', num).text


def get_a_star(html, num):
    """Return the ``p.star`` text of film number *num* without its label."""
    return _after_label(
        _select_first(html, _INFO_SELECTOR + ' > p.star', num).text)


def get_a_releasetime(html, num):
    """Return the ``p.releasetime`` text of film number *num* without its label."""
    return _after_label(
        _select_first(html, _INFO_SELECTOR + ' > p.releasetime', num).text)


def get_a_score(html, num):
    """Return the score of film number *num* as a string.

    The page holds the score in two elements (``i.integer`` and
    ``i.fraction``); their texts are concatenated.
    """
    integer = _select_first(html, _SCORE_SELECTOR + ' > i.integer', num)
    fraction = _select_first(html, _SCORE_SELECTOR + ' > i.fraction', num)
    return integer.text + fraction.text


def get_a_img_src(html, num):
    """Return the ``data-src`` attribute of the poster image of film *num*."""
    img = _select_first(html, _ITEM_SELECTOR + ' > a > img.board-img', num)
    # NOTE (from the original author): this image source does not match the
    # one shown on the rendered page.
    return img.get('data-src')


def get_info(url, page):
    """Scrape one board page and insert one document per film into MongoDB.

    Args:
        url: URL template passed through to ``get_html``.
        page: Page value passed through to ``get_html``.

    Each inserted document has the keys ``rank``, ``name``, ``star``,
    ``releasetime``, ``score`` and ``img``.
    """
    html = get_html(url, page)
    for num in range(1, _ITEMS_PER_PAGE + 1):
        movie_item_info.insert_one({
            'rank': get_a_rank(html, num),
            'name': get_a_title(html, num),
            'star': get_a_star(html, num),
            'releasetime': get_a_releasetime(html, num),
            'score': get_a_score(html, num),
            'img': get_a_img_src(html, num),
        })


阅读全文
0 0
原创粉丝点击