爬虫lianjia

来源:互联网 发布:中国社交网络发展历程 编辑:程序博客网 时间:2024/06/03 23:31
# encoding: utf8
"""Scrape closed-deal listings from bj.lianjia.com into a MySQL table.

For result pages 1..100 the script downloads the page, pulls the listing
title, the info line and the detail link of every entry out of the HTML,
and inserts one row per listing into the ``lianjia`` table
(``href``, ``name``, ``style``, ``area``, ``orientation``). A listing whose
``href`` is already stored is reported and skipped.
"""
import re

import mysql.connector
import requests
from lxml import etree

BASE_URL = 'http://bj.lianjia.com/chengjiao/pg'
FIRST_PAGE = 1
LAST_PAGE = 100
# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 30

# Absolute XPaths into the result page; each one yields one item per listing.
TITLE_XPATH = '/html/body/div[4]/div[1]/ul/li/div/div[1]/a/text()'
INFO_XPATH = '/html/body/div[4]/div[1]/ul/li/div/div[2]/div[1]/text()'
HREF_XPATH = '/html/body/div[4]/div[1]/ul/li/a/@href'

# Values are bound by the driver (%s placeholders) instead of being
# concatenated into the statement: the text comes from a remote page, so a
# quote character in it would otherwise break or alter the query.
EXISTS_SQL = "select  EXISTS (select 1 from lianjia where `href`=%s)"
INSERT_SQL = (
    "insert into lianjia(`href`,`name`,`style`,`area`,`orientation`) "
    "VALUES(%s,%s,%s,%s,%s)"
)


def parse_listing(title, info, link):
    """Turn the raw strings of one listing into a database row.

    Args:
        title: Text of the listing title; its first three
            whitespace-separated tokens are used as name, style and area.
        info: Text of the info line; the part before the first ``|`` is
            used as the orientation.
        link: The listing's ``href`` attribute.

    Returns:
        A tuple ``(href, name, style, area, orientation)``, or ``None`` when
        the title has fewer than three tokens.
    """
    parts = title.split()
    if len(parts) < 3:
        return None
    name, style, area = parts[:3]
    # '\xa0' is a non-breaking space, which str.strip() alone may leave
    # inside the value.
    orientation = info.replace('\xa0', '').split('|')[0].strip()
    return link, name, style, area, orientation


def fetch_page(page):
    """Download one result page and return its raw listing triples.

    Args:
        page: 1-based page number appended to ``BASE_URL``.

    Returns:
        A list of ``(title, info, link)`` string tuples, one per listing.
        Empty when the page could not be parsed into a document.
    """
    # NOTE(review): the page is requested with POST, as in the original
    # script; confirm whether a plain GET is what the site expects.
    response = requests.post(BASE_URL + str(page), timeout=REQUEST_TIMEOUT)
    document = etree.HTML(response.text.replace('\xa9', ''))
    if document is None:
        return []
    titles = document.xpath(TITLE_XPATH)
    infos = document.xpath(INFO_XPATH)
    links = document.xpath(HREF_XPATH)
    # zip() stops at the shortest list, so a listing that lacks one of the
    # three nodes cannot cause an IndexError.
    return list(zip(titles, infos, links))


def save_listing(db, cursor, row):
    """Insert ``row`` unless a row with the same href is already stored.

    Args:
        db: Open database connection; committed after a successful insert.
        cursor: Cursor created from ``db``.
        row: Tuple ``(href, name, style, area, orientation)``.

    Returns:
        ``True`` when the row was inserted, ``False`` when it already
        existed.
    """
    href, name = row[0], row[1]
    cursor.execute(EXISTS_SQL, (href,))
    status = cursor.fetchall()
    if status[0][0]:
        print(name + '已经存在')
        return False
    cursor.execute(INSERT_SQL, row)
    db.commit()
    return True


def main():
    """Crawl every result page and store the listings found on it."""
    # The loop needs `db` and `cursor`; the published script had this
    # connection commented out and therefore failed with NameError.
    db = mysql.connector.connect(
        user='root', password='root', port=3306, host='127.0.0.1', db='233')
    try:
        cursor = db.cursor()
        try:
            for page in range(FIRST_PAGE, LAST_PAGE + 1):
                for title, info, link in fetch_page(page):
                    row = parse_listing(title, info, link)
                    if row is None:
                        print('skipping listing with unexpected title: '
                              + title)
                        continue
                    save_listing(db, cursor, row)
        finally:
            cursor.close()
    finally:
        db.close()


if __name__ == '__main__':
    main()
原创粉丝点击