Web Crawlers

1. Install Python (versions 2 and 3 are not compatible; this article assumes Python 3).
2. Run a terminal as administrator and install Beautiful Soup 4: pip install beautifulsoup4
3. Install Requests: pip install requests (see the quick import check after this list).
4. Analyze the structure of the target page to find the tags and classes that hold the content you want.
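Steps 2 and 3 can be confirmed with a short import check; both packages expose a standard version attribute, so this is just a sanity-check sketch:

# Quick check that both packages installed correctly.
import requests
import bs4

print(requests.__version__)
print(bs4.__version__)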
A minimal example:

import requests
from bs4 import BeautifulSoup

resp = requests.get("网址")  # replace "网址" with the URL of the target page
soup = BeautifulSoup(resp.text, 'html.parser')

# Inspect the page source and locate the tags and classes that hold the content
title = soup.find('ul', class_='detaila').text.strip()
content = soup.find('ul', class_='detailc').text.strip()

file_name = '{}.txt'.format(title)
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(content)
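For step 4, it can help to dump candidate tags before hardcoding anything. The sketch below is only an illustration: it assumes the same placeholder URL and simply lists the class attribute of every ul element so that content-bearing classes such as detaila and detailc can be spotted. The built-in html.parser is used here; the multi-threaded example below uses lxml, which must be installed separately with pip install lxml.

# Sketch for step 4: print the class attribute of every <ul> on the page
# so the classes that wrap the interesting content can be identified.
import requests
from bs4 import BeautifulSoup

resp = requests.get("网址")  # same placeholder URL as above
soup = BeautifulSoup(resp.text, 'html.parser')
for ul in soup.find_all('ul'):
    print(ul.get('class'))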

Example: multi-threaded crawl of 飞华网 (fh21.com.cn)

import re
import sys
import time
import requests
import threading
from urllib import parse
from bs4 import BeautifulSoup

ori_url = 'http://dise.fh21.com.cn/department/illnesses.html'
session = requests.Session()
root_urls = []  # absolute URLs of every department page
tag_urls = []   # absolute URLs of every page to be scraped
times = 16      # number of worker threads


def main():
    # Collect the department pages from the index page.
    soup = request_get(ori_url)
    for root in soup.find_all('ul', class_='level2'):
        for tag in root.find_all('a', class_='link08 '):
            root_urls.append(parse.urljoin(ori_url, tag['href']))

    # Walk each department's disease list, following the pagination links.
    for url in root_urls:
        soup = request_get(url)
        if soup == 'pass':
            # print('Skip this one url above.', file=sys.stderr)
            continue
        list_root = soup.find('div', class_='dise_list')
        for a in list_root.find_all('a', class_='link08'):
            tag_urls.append(a.get('href'))
        page_tab = soup.find('div', class_='pageStyle')
        if page_tab:
            next_page = page_tab.find('span', class_='current').next_sibling
            if next_page:
                soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
                scrape_list_page(soup)

    # print('A total of {} urls were scraped.'.format(len(tag_urls)), file=sys.stderr)
    # print('--------    Start saving...    --------', file=sys.stderr)

    # Split the collected URLs into `times` chunks, one per thread,
    # plus one extra thread for any remainder.
    count = 0
    temp = len(tag_urls) // times
    threads = []
    while count < times:
        t = threading.Thread(target=process_task,
                             args=(tag_urls[(temp * count):(temp * (count + 1))],))
        threads.append(t)
        count += 1
    if (temp * count) < len(tag_urls):
        t = threading.Thread(target=process_task,
                             args=(tag_urls[(temp * count):],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    tag_urls.clear()
    root_urls.clear()
    # print('All completed.', file=sys.stderr)


def request_get(url):
    # GET a page; return a BeautifulSoup tree, or the string 'pass' on failure.
    resp = session.get(url)
    if resp.status_code != 200:
        # print('404', file=sys.stderr)
        return 'pass'
    return BeautifulSoup(resp.text, 'lxml')


def scrape_list_page(soup):
    # Collect the links from a pagination page, then recurse into the next page.
    for a in BeautifulSoup(str(list(soup.select('.dise_list_title')[1].next_siblings)),
                           'html.parser').select('.link08'):
        tag_urls.append(a.get('href'))
    page_tab = soup.find('div', class_='pageStyle')
    if page_tab:
        next_page = page_tab.find('span', class_='current').next_sibling
        if next_page:
            soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
            scrape_list_page(soup)


def process_task(targets):
    # Worker: fetch each list page, follow the link in the third tab to the
    # detail page, and save that page's content.
    for url in targets:
        time.sleep(1)
        soup = request_get(url)
        detail_url = parse.urljoin(url, soup.select('p[data-seq="3"] > a')[0].get('href'))
        save_txt(request_get(detail_url))


def save_txt(soup):
    # Use the breadcrumb text as the file name, replacing characters Windows forbids.
    title = re.sub(r'[\\/\:\*\?"\<\>\|]', '@',
                   soup.find('div', class_='navigator').find_all('a', class_='link04')[2].text).strip()
    content = soup.find('ul', class_='detailc').text.strip()
    file_name = '{}.txt'.format(title)
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        f.write(content)


if __name__ == '__main__':
    main()
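The manual slicing of tag_urls into `times` chunks can also be handled by the standard-library ThreadPoolExecutor. The sketch below is an alternative, not part of the original example; it reuses the request_get and save_txt helpers, the parse import, and the times and tag_urls names defined above.

# Alternative sketch: ThreadPoolExecutor distributes the URLs itself,
# so no manual slicing or thread bookkeeping is needed.
from concurrent.futures import ThreadPoolExecutor

def process_one(url):
    # One iteration of process_task from the example above.
    time.sleep(1)
    soup = request_get(url)
    detail_url = parse.urljoin(url, soup.select('p[data-seq="3"] > a')[0].get('href'))
    save_txt(request_get(detail_url))

with ThreadPoolExecutor(max_workers=times) as pool:
    # list() forces iteration so any exception raised in a worker surfaces here.
    list(pool.map(process_one, tag_urls))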