python处理3
来源:互联网 发布:泰和安主机编程程序 编辑:程序博客网 时间:2024/05/01 08:33
import random
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# Candidate header values. Only the entries selected in randHeader() are
# actually sent; the alternatives are kept from the original script.
_HEAD_CONNECTION = ('Keep-Alive', 'close')
_HEAD_ACCEPT = ('text/html, application/xhtml+xml, */*',)
_HEAD_ACCEPT_LANGUAGE = (
    'zh-CN,fr-FR;q=0.5',
    'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
)

# Pool of browser User-Agent strings; one is picked at random per request.
# Built once at import time instead of on every randHeader() call.
_HEAD_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
    # This entry was split by a stray line break in the original paste; a
    # newline inside a header value is invalid, so it is rejoined here.
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
)

# Number of rows written to each data/amazon<k>.txt file before rolling over.
_ROWS_PER_FILE = 100


def randHeader():
    """Build the HTTP request headers with a randomly chosen User-Agent.

    Returns:
        dict: ``Connection``, ``Accept`` and ``Accept-Language`` are fixed;
        ``User-Agent`` is drawn uniformly from ``_HEAD_USER_AGENT``.
    """
    return {
        'Connection': _HEAD_CONNECTION[0],
        'Accept': _HEAD_ACCEPT[0],
        'Accept-Language': _HEAD_ACCEPT_LANGUAGE[1],
        'User-Agent': random.choice(_HEAD_USER_AGENT),
    }


def getDataById(queryId):
    """Fetch the Amazon product page for *queryId* and return its title text.

    Args:
        queryId: Product identifier appended to ``https://www.amazon.com/dp/``
            (converted with ``str``).

    Returns:
        The ``.string`` of the first ``<span id="asTitle">`` element. This is
        ``None`` when that span contains nested markup rather than plain text.

    Raises:
        IndexError: If the page has no ``<span id="asTitle">`` element.
        urllib.error.URLError: If the request fails.
    """
    req = urllib.request.Request(
        url="https://www.amazon.com/dp/" + str(queryId),
        headers=randHeader(),
    )
    # Close the connection as soon as the body is read instead of leaking it.
    with urllib.request.urlopen(req) as webpage:
        html = webpage.read()
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find_all("span", id="asTitle")
    return content[0].string


def _open_chunk(index):
    """Open ``data/amazon<index>.txt`` for appending and write the header row."""
    # Scraped titles may contain any Unicode character, so do not depend on
    # the platform's default encoding for the output files.
    chunk = open("data/amazon" + str(index) + ".txt", 'a', encoding='utf-8')
    chunk.write("asin, state\n")
    return chunk


def main():
    """Read ASINs from ``asin.txt`` and record each one's page title.

    Results are appended as ``asin,state`` rows to ``data/amazon<k>.txt``,
    starting a new file every ``_ROWS_PER_FILE`` ASINs. The ``data``
    directory must already exist.
    """
    with open('asin.txt', 'r') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines so no request is made for an empty ASIN.
        qs = [asin for asin in stripped if asin]

    k = 0
    file_object = _open_chunk(k)
    try:
        for i, asin in enumerate(qs):
            print("运行第" + str(i) + "个数据")
            print("asin = " + asin)
            if i % _ROWS_PER_FILE == 0 and i > 0:
                # Close the finished chunk before opening the next one; the
                # original leaked every handle except the last.
                file_object.close()
                k += 1
                file_object = _open_chunk(k)
            # getDataById may return None; str() records that as "None"
            # rather than raising TypeError in the middle of a batch.
            state = str(getDataById(asin))
            print("状态 = " + state + "\n")
            file_object.write(asin + "," + state + "\n")
    finally:
        # Always flush and close the current chunk, even if a request fails.
        file_object.close()


if __name__ == '__main__':
    main()
阅读全文
0 0
- python处理3
- Python自然语言处理 3 处理原始文本
- Python 3 中文路径处理
- Python与图像处理3
- 3、Python 处理 JSON 数据
- 【Python】Python处理Excel
- [Python] Python日程处理
- Python笔记(3)异常处理
- NLP with python 3 处理原始文本
- Python图像处理(3):视频显示
- 3-python图像处理之NumPy
- Python 3基础教程25-异常处理
- Python 3 的数据类型和基本处理
- 【python图像处理】python绘制3D图形
- 【python学习记录】-3-Python图像处理库:Pillow 初级教程
- 【Python】Python处理Json文件
- python文件处理(1)
- python异常处理
- 提车注意事项清单
- css-如何让你的元素居中?
- 基于CentOS搭建 Git 服务
- MVC dropdownlist二级联动
- 如何跟踪Log4j或SLF4J加载哪个配置文件进行初始化
- python处理3
- 方法调用和数组概念以及for each
- C之有趣-绘制正弦曲线
- Linux下通过ODBC连接数据库
- python 案例 007 (copy list 的两种模式)
- java求数组中最大值最小值
- Java提高篇——equals()与hashCode()方法详解
- 解决端口被占用问题
- maven项目springMVC+spring+mybatis集成,实现人员登录