python3系列-爬虫解析

来源:互联网 发布:人工智能计算器 iphone 编辑:程序博客网 时间:2024/06/05 08:31
# Scrape biquge5200.com: print the chapter index of one novel, then dump
# the text of one chapter. (Reconstructed from a line-mangled paste.)
import urllib.request

from bs4 import BeautifulSoup


def _fetch_gbk(url):
    """Download *url* and decode the body as GBK (the site's charset)."""
    req = urllib.request.Request(url)
    # Context manager ensures the HTTP response is closed even on error.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("gbk")


def main():
    """Print (href, title) for every chapter link, then one chapter body."""
    index_url = "http://www.biquge5200.com/52_52542/"
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # results stable across environments.
    index_soup = BeautifulSoup(_fetch_gbk(index_url), "html.parser")
    chapter_box = index_soup.find(id='list')
    # Guard against a missing #list element instead of crashing on None.
    anchors = chapter_box.find_all('a') if chapter_box is not None else []
    # NOTE(review): the original skipped the first 9 anchors — presumably a
    # "latest chapters" block duplicated at the top of the index; kept as-is.
    for anchor in anchors[9:]:
        print(anchor.get('href'), anchor.text)

    chapter_url = "http://www.biquge5200.com/52_52542/150290199.html"
    chapter_soup = BeautifulSoup(_fetch_gbk(chapter_url), "html.parser")
    print(chapter_soup.find(id='content'))


if __name__ == "__main__":
    main()
# Crawl joke listings from xiaohua.zol.com.cn and append each article to
# c:/dz.txt as a "category|||title|||content" line.
# (Reconstructed from a line-mangled paste.)
import urllib.request

from bs4 import BeautifulSoup


def _fetch_gbk(url):
    """Download *url* and decode the body as GBK (the site's charset)."""
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        return response.read().decode("gbk")


def main():
    """Walk listing pages 0..1829, fetch every article, append to c:/dz.txt."""
    for page_no in range(0, 1830):
        print(page_no)  # progress indicator, one line per listing page
        list_url = "http://xiaohua.zol.com.cn/new/" + str(page_no) + ".html"
        listing = BeautifulSoup(_fetch_gbk(list_url), "html.parser")
        article_list = listing.find(attrs={'class': 'article-list'})
        if article_list is None:
            # Layout missing on this page: the original effectively yielded an
            # empty summary list here, so skipping preserves behavior.
            continue
        for summary in article_list.find_all(attrs={'class': 'article-summary'}):
            href = summary.find(attrs={'class': 'article-title'}).find_all('a')[0].get('href')
            # Distinct name instead of rebinding the outer loop variable
            # (the original reused `u` for the article URL).
            article_url = "http://xiaohua.zol.com.cn" + href
            # Parse the article page ONCE; the original built three separate
            # BeautifulSoup trees from the same downloaded HTML.
            article = BeautifulSoup(_fetch_gbk(article_url), "html.parser")
            # 4th breadcrumb anchor is the joke category on this site's layout.
            cls = article.find(attrs={'class': 'wrapper location clearfix'}).find_all("a")[3].text
            title = article.find(attrs={'class': 'article-title'}).text
            content = article.find(attrs={'class': 'article-text'}).text
            fcontent = cls + "|||" + title + "|||" + content + "\n"
            # Append mode: repeated runs keep accumulating records.
            with open("c:/dz.txt", 'a') as file:
                file.writelines(fcontent.replace(u'\xa0', u' '))


if __name__ == "__main__":
    main()


原创粉丝点击