第一次学会使用正则表达式爬网页,纪念下,下一步学习动态网页爬取

来源:互联网 发布:ubuntu samba图形界面 编辑:程序博客网 时间:2024/06/07 14:26
# Scrape product price-announcement tables from bocommlife.com into one CSV.
#
# Flow: fetch the site index, regex out the product <option> entries,
# map columnid -> product name, then for each product follow the
# "查看价格公[告]" link to the paged list and collect every 4-cell table
# row into a DataFrame.  Reconstructed from a whitespace-mangled paste;
# NOTE(review): the original's indentation was ambiguous around the final
# appends — this version aggregates once per product, verify against the
# expected CSV.

import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE = 'http://www.bocommlife.com/sites/main'

# Patterns are compiled once here instead of inside the loops.
# NOTE(review): ".*万能*" looks like it was meant to be ".*万能.*" (option
# text containing 万能); kept byte-identical to preserve matching behavior.
OPTION_RE = re.compile(r'''<option value="(.*万能*.*?)</option>''')
COLUMNID_RE = re.compile(r'columnid=(\d*?)&page')
NAME_RE = re.compile(r'">(.*)')
PRICE_LINK_RE = re.compile(r"href=(.*?)>查看价格公")
PAGES_RE = re.compile(r'共(\d+)页')
DIGITS_RE = re.compile(r'\d+')  # was '\d+' (non-raw) in the original
TH_RE = re.compile(r'<th>(.*?)</th>')
TD_RE = re.compile(r'<td>(.*?)</td>')


def fetch_product_names():
    """Return {columnid: product name} parsed from the site index page."""
    page = requests.get(BASE + '/index.htm', timeout=15)
    name_list = {}
    for option in OPTION_RE.findall(page.text):
        name_list[COLUMNID_RE.findall(option)[0]] = NAME_RE.findall(option)[0]
    return name_list


def fetch_price_table(columnid, product_name):
    """Scrape every page of one product's price list into a DataFrame.

    The product page links to the real list page, whose columnid differs
    from the index columnid — it is extracted from the price-link href.
    """
    page = requests.get(
        BASE + '/twainindex/jggg.htm?columnid=' + columnid + '&page=1',
        timeout=15)
    url_num = DIGITS_RE.findall(PRICE_LINK_RE.findall(page.text)[0])[0]

    list_url = BASE + '/list/jggg_ls.htm?columnid=' + url_num + '&page='
    page = requests.get(list_url + '1', timeout=15)
    num_pages = int(PAGES_RE.findall(page.text)[0])
    print(num_pages)

    frames = []
    for page_no in range(1, num_pages + 1):
        page = requests.get(list_url + str(page_no), timeout=15)
        soup = BeautifulSoup(page.text, 'lxml')
        content = str(soup.find_all(class_='xwzx-list'))
        columns = TH_RE.findall(content)[:-1]  # last <th> is a filler cell
        cells = TD_RE.findall(content)
        # Cells arrive flat; regroup 4 per row, dropping any incomplete
        # trailing group (matches the original i % 4 chunking behavior).
        full = len(cells) - len(cells) % 4
        rows = [cells[k:k + 4] for k in range(0, full, 4)]
        df = pd.DataFrame(rows)
        df.columns = columns
        df['产品名称'] = [product_name] * len(df)
        print(df)
        frames.append(df)
    # pd.concat replaces DataFrame.append (removed in pandas >= 2.0).
    return pd.concat(frames) if frames else pd.DataFrame()


def main():
    """Collect all products' price tables and write them to 交银.csv."""
    name_list = fetch_product_names()
    frames = [fetch_price_table(cid, name) for cid, name in name_list.items()]
    jiaoyin = pd.concat(frames) if frames else pd.DataFrame()
    jiaoyin.to_csv('交银.csv')


if __name__ == '__main__':
    main()

原创粉丝点击