中英人寿爬虫爬取，使用selenium自动化和正则表达式

来源：互联网 发布：源氏物语知乎 编辑：程序博客网 时间：2024/04/28 14:32
"""Scrape the Aviva-COFCO customer information search pages.

The script drives Chrome through Selenium, extracts the result tables from
the rendered HTML with regular expressions, tags every row with the product
name shown on the page, and writes the combined table to ``中英.csv``.
"""
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select

_SEARCH_PAGE = (
    'https://cnhome.aviva-cofco.com.cn:8080/web2013/customer/'
    'customer_infoSearch.jsp'
)
# Landing page whose <option> elements list the titles to search for.
INDEX_URL = _SEARCH_PAGE + '?catid=6748|6786|7074|7124|7131'
# A search URL is PREFIX + url-quoted title + SUFFIX.
SEARCH_URL_PREFIX = (
    _SEARCH_PAGE + '?catid=6748%7C6786%7C7074%7C7124%7C7131&title='
)
SEARCH_URL_SUFFIX = '&month1=&month2='

# Number of consecutive <td> cells that make up one table row.
CELLS_PER_ROW = 5
NEXT_PAGE_LINK_TEXT = '下一页'
PRODUCT_NAME_COLUMN = '产品名称'
OUTPUT_CSV = '中英.csv'

# ``[^>]*`` keeps each match inside a single opening tag, so several elements
# on the same source line are captured individually instead of being
# swallowed by a greedy ``.*``.
OPTION_RE = re.compile(r"""<option value=[^>]*>(.*?)</option>""")
PAGE_NUMBER_RE = re.compile(r'<span>(\d+)</span>')
HEADER_RE = re.compile(r"""<th[^>]*>(.*?)</th>""")
CELL_RE = re.compile(r"""<td>(.*?)</td>""")
PRODUCT_NAME_RE = re.compile(
    r"""<div style="color:#57B648[^>]*">(.*?)</div>"""
)


def _list_titles(driver):
    """Return the option texts of the landing page, minus the first one.

    The first <option> is skipped exactly as the original script did
    (presumably a placeholder entry -- verify against the live page).
    """
    driver.get(INDEX_URL)
    return OPTION_RE.findall(driver.page_source)[1:]


def _page_count(page):
    """Return the largest ``<span>N</span>`` number in *page*, or 1 if none."""
    numbers = PAGE_NUMBER_RE.findall(page)
    print(numbers)
    return max(int(number) for number in numbers) if numbers else 1


def _parse_table(page):
    """Build a DataFrame from the <th> headers and <td> cells of *page*.

    Cells are grouped ``CELLS_PER_ROW`` at a time; a trailing incomplete
    group is dropped.  A page without any cells yields an empty frame that
    still carries the header columns.

    Raises:
        ValueError: if rows exist but the header count differs from the
            row width.
    """
    columns = HEADER_RE.findall(page)
    cells = CELL_RE.findall(page)
    complete = len(cells) - len(cells) % CELLS_PER_ROW
    rows = [
        cells[start:start + CELLS_PER_ROW]
        for start in range(0, complete, CELLS_PER_ROW)
    ]
    return pd.DataFrame(rows, columns=columns)


def _scrape_product(driver, title):
    """Collect every result page for *title* into a single DataFrame.

    The title is percent-encoded before it is put into the query string so
    that characters such as ``&`` or ``#`` cannot corrupt the URL.
    """
    driver.get(SEARCH_URL_PREFIX + quote(title, safe='') + SEARCH_URL_SUFFIX)
    page = driver.page_source
    page_count = _page_count(page)

    frames = []
    for page_index in range(page_count):
        page = driver.page_source
        frames.append(_parse_table(page))
        if page_index + 1 == page_count:
            break  # last page: nothing left to click through to
        try:
            driver.find_element(By.LINK_TEXT, NEXT_PAGE_LINK_TEXT).click()
        except WebDriverException as exc:
            # Without a working "next page" link the following iterations
            # would re-read the same page and duplicate its rows.
            print('stopping pagination for %r: %s' % (title, exc))
            break

    result = pd.concat(frames) if frames else pd.DataFrame()
    names = PRODUCT_NAME_RE.findall(page)
    # Scalar assignment broadcasts to every row.  Falls back to the searched
    # title when the page shows no name element (assumes the option text is
    # the product name -- verify against the live page).
    result[PRODUCT_NAME_COLUMN] = names[0] if names else title
    return result


def main():
    """Scrape every listed title and write the combined rows to the CSV."""
    driver = webdriver.Chrome()
    try:
        frames = [
            _scrape_product(driver, title) for title in _list_titles(driver)
        ]
    finally:
        # Always release the browser, even when scraping fails half-way.
        driver.quit()

    zhongying = pd.concat(frames) if frames else pd.DataFrame()
    zhongying.to_csv(OUTPUT_CSV)


if __name__ == '__main__':
    main()
阅读全文
0 0