Python学习笔记——20170831

来源:互联网 发布:java web开发教程视频 编辑:程序博客网 时间:2024/06/03 23:00

同程旅游网 爬虫练习

  • 类库安装
    • pip install requests
    • pip install beautifulsoup4
    • pip install lxml（代码中 BeautifulSoup 使用 'lxml' 解析器，需要单独安装）
  • 代码
import json
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str, timeout: float = 10):
    """Fetch the HTML source of a page.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: the decoded page text, or None when the response status is
        not 200 (a message is printed in that case)
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"}
    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("网络访问出错")
        return None
    # Let requests guess the charset from the body instead of trusting the
    # (possibly missing) Content-Type header.
    response.encoding = response.apparent_encoding
    return response.text


def _as_text(node):
    """Convert a BeautifulSoup node to a plain ``str``; pass None through."""
    return None if node is None else str(node)


def parse_html(html: str):
    """Extract the listed items from the page source.

    :param html: HTML source of the listing page
    :return: list of dicts with the keys ``type``, ``title``, ``price`` and
        ``satisfied``; ``satisfied`` is the string ``'None'`` when the item
        has no ``p.sat-num`` element
    """
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select("#tagList > ul > li > div.line-info > div")

    info_list = []
    for item in items:
        # Each item is re-parsed as its own document, so the selectors below
        # are evaluated against a standalone tree (the item element included).
        item_soup = BeautifulSoup(str(item), 'lxml')
        info = {
            'type': _as_text(item_soup.select('div.line-imgbox > span')[0].string),
            'title': _as_text(item_soup.select('p.line-title > b')[0].string),
            # The price is the node that directly follows the <em> element.
            'price': _as_text(
                item_soup.select('div.line-pricebox > div > p')[0].em.next_sibling
            ),
        }

        satisfaction = item_soup.select('p.sat-num')
        if satisfaction:
            # The value is the node that directly precedes the <em> element.
            info['satisfied'] = _as_text(satisfaction[0].em.previous_sibling)
        else:
            info['satisfied'] = 'None'
        info_list.append(info)
    return info_list


def save_file(path: str, text: str) -> None:
    """Write text to a file, creating missing parent directories.

    :param path: destination file path
    :param text: content to write (stored as UTF-8)
    :return: None
    """
    # os.path.dirname splits off the real directory part; str.strip() would
    # remove a *set of characters* from both ends rather than the file name.
    dir_path = os.path.dirname(path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)


if __name__ == '__main__':
    url = 'https://www.ly.com/dujia/taiguo-lvyou/f394/'
    html = get_html(url)
    # get_html returns None on a failed request; there is nothing to parse then.
    if html is not None:
        infoList = parse_html(html)
        # json.dumps yields valid JSON even when values contain quotes or are
        # None, which quote replacement on repr() output does not.
        save_file('./lyinfo.json', json.dumps(infoList, ensure_ascii=False, indent=2))