Completely Cracking the Anti-Crawler on Sogou's WeChat Official Account Article Search


The approach is simple: selenium + chromedriver. The Sogou side (weixin.sogou.com) is handled by driving a real Chrome browser, so its anti-crawler checks never trigger; the article pages themselves live on mp.weixin.qq.com, which belongs to Tencent and has no anti-crawler measures, so plain urllib or requests works there.
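For example, an article page on mp.weixin.qq.com can be fetched with nothing more than requests. This is a minimal sketch: the article URL is a placeholder and the User-Agent string is just an ordinary browser UA, not something required by the site.

import requests

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# Placeholder URL -- substitute a real article link collected from the Sogou result pages.
article_url = "https://mp.weixin.qq.com/s?__biz=..."
resp = requests.get(article_url, headers=header)
print(resp.status_code, len(resp.text))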

You need to log in by scanning the QR code with WeChat; without logging in, Sogou only returns 10 pages of search results.
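A minimal way to make sure the scan has actually happened before the crawl starts is sketched below; the full script further down simply relies on the keyword prompt to pause, so this explicit wait is an optional addition.

driver.find_element_by_xpath('//*[@id="loginBtn"]').click()   # opens the QR-code login panel
# Block until the user confirms the scan finished; without a logged-in session
# the crawl stops after 10 result pages.
input("Scan the QR code in the Chrome window with WeChat, then press Enter to continue...")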

from selenium import webdriver
from bs4 import BeautifulSoup
import threading
import time
import requests
import urllib.request

# Browser-like headers for the plain HTTP requests against mp.weixin.qq.com.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

driver = webdriver.Chrome()
driver.get("http://weixin.sogou.com/")
driver.find_element_by_xpath('//*[@id="loginBtn"]').click()   # open the QR-code login box

find = input("Enter the keyword you want to search for: ")
driver.find_element_by_xpath('//*[@id="query"]').send_keys(find)
driver.find_element_by_xpath('//*[@id="searchForm"]/div/input[3]').click()
time.sleep(2)

# Walk through the result pages and collect the article URLs.
url_list = []
while True:
    bs_obj = BeautifulSoup(driver.page_source, "html.parser")
    for box in bs_obj.findAll("div", {"class": "txt-box"}):
        url_list.append(box.h3.a.attrs['href'])
    next_link = bs_obj.find("a", {"id": "sogou_next"})
    if next_link is None:          # no "next page" link means we reached the last page
        break
    driver.get("http://weixin.sogou.com/weixin" + next_link.attrs['href'])
    time.sleep(1)

def get_img(url, num, connect, cursor):
    """Download every image of one article and record it in the database."""
    response = requests.get(url, headers=header).content
    content = str(response, encoding="utf-8")
    bs_obj = BeautifulSoup(content, "html.parser")
    count = 0
    for img in bs_obj.findAll("img"):
        try:
            imgurl = get_total_url(img.attrs["data-src"])    # helper: complete the image URL
            store_name = "%s%s" % (num, count)
            path = r"C:\Users\Mr.Guo\Pictures\weixin"
            check_mkdir(path)                                # helper: create the folder if needed
            urllib.request.urlretrieve(imgurl, r"C:\Users\Mr.Guo\Pictures\weixin\%s.jpeg" % store_name)
            insert_into_table(connect, cursor, store_name, content)   # helper: save a DB record
            count += 1
        except Exception:
            pass

# One thread per article; connect and cursor are the database connection and cursor created elsewhere.
for url_num in range(len(url_list)):
    t = threading.Thread(target=get_img, args=(url_list[url_num], url_num, connect, cursor))
    t.start()
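The script above calls several names it never defines: get_total_url, check_mkdir, insert_into_table, and the connect/cursor pair passed to the threads. Minimal sketches are given below under assumed behaviour; the MySQL driver, table layout, and connection parameters are assumptions and not from the original post.

import os
import pymysql   # assumption: any DB-API driver would do; the original post never names one

def get_total_url(url):
    # "data-src" on mp.weixin.qq.com is often protocol-relative ("//mmbiz.qpic.cn/...");
    # prepend a scheme so urlretrieve can fetch it.
    if url.startswith("//"):
        return "http:" + url
    return url

def check_mkdir(path):
    # Create the download directory if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)

def insert_into_table(connect, cursor, store_name, html):
    # Record which article HTML a saved image prefix belongs to; the table layout is hypothetical.
    cursor.execute("INSERT INTO weixin_articles (store_name, html) VALUES (%s, %s)",
                   (store_name, html))
    connect.commit()

# Hypothetical database connection shared by the worker threads.
connect = pymysql.connect(host="localhost", user="root", password="your_password",
                          db="weixin", charset="utf8mb4")
cursor = connect.cursor()

Note that sharing one connection and cursor across threads, as the original script does, is not thread-safe; either serialize the inserts with a lock or give each worker thread its own connection.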

