Python3爬取网页数据存入MySQL

来源：互联网　发布：什么叫网络建设与管理　编辑：程序博客网　时间：2024/06/01 21:16
不太会用这个编辑器，就把 Word 截图贴过来了……
这里写图片描述
# Scrape supervisor reviews from mysupervisor.org into the MySQL table COMMM.
#
# Crawl shape: forum index -> one page per school -> one thread per teacher
# -> one row per post (school name, teacher name, post date, comment text).
import http.client
import os
import random
import ssl  # used to relax HTTPS certificate verification (see main)
import sys
import time
import urllib.parse
import urllib.request

import mysql.connector
from bs4 import BeautifulSoup

FORUM_INDEX_URL = ("https://www.mysupervisor.org/viewforum.php"
                   "?f=115&sid=9867c9c03c1efefa23dafda9e7d61d07")

# Connection settings. The password can be overridden with the
# MYSQL_PASSWORD environment variable so it need not live in the source;
# the default keeps the script's original behaviour.
DB_SETTINGS = {
    "host": "localhost",
    "user": "root",
    "passwd": os.environ.get("MYSQL_PASSWORD", "dongxue0123"),
    "db": "mysql",
    "port": 3306,
    "charset": "utf8",
}

CREATE_TABLE_SQL = """CREATE TABLE COMMM(
                           school_name char(255) NOT NULL ,
                           teacher_name char(255) NOT NULL ,
                           comm_date char(255),
                           commm char(255),
                           index(teacher_name))"""

INSERT_SQL = ("insert into COMMM(school_name,teacher_name,comm_date,commm)" "VALUE (%s,%s,%s,%s)")

COLUMN_WIDTH = 255           # every COMMM column is char(255)
MAX_ATTEMPTS = 5             # give up on a page after this many failed fetches
RETRY_DELAY_SECONDS = 1      # pause between fetch attempts
MAX_POLITE_DELAY_SECONDS = 10  # upper bound of the random pause per school

# The "postprofile" text is one long string: a guest label followed by the
# post time (per the original author's note). Only the time is wanted, so the
# first 11 characters are dropped.
# NOTE(review): the offset is tied to the site's current markup -- confirm.
POSTPROFILE_DATE_OFFSET = 11


def _clip(text, limit=COLUMN_WIDTH):
    """Return *text* cut to at most *limit* characters so it fits its column."""
    return text[:limit]


def fetch_soup(url, timeout, max_attempts=MAX_ATTEMPTS, announce_retry=False):
    """Download *url* and return it parsed with BeautifulSoup.

    Network errors are retried up to *max_attempts* times with a short pause
    in between. Returns ``None`` when every attempt failed, so that one dead
    link cannot stall the whole crawl.

    When *announce_retry* is true, each failed attempt prints the original
    script's "reconnect to web.." message.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                html = response.read()
        except (OSError, http.client.HTTPException) as exc:
            # URLError, socket timeouts and SSL errors are all OSError
            # subclasses; HTTPException covers truncated responses.
            last_error = exc
            if announce_retry:
                print("reconnect to web..")
            if attempt < max_attempts:
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            return BeautifulSoup(html, "html.parser")
    print("giving up on {} after {} attempts: {}".format(
        url, max_attempts, last_error), file=sys.stderr)
    return None


def iter_posts(thread_soup):
    """Yield ``(date, comment)`` for every post found in a teacher's thread."""
    for post in thread_soup.find_all('div', class_='inner'):
        profile = post.find(class_="postprofile")
        content = post.find(class_="content")
        # Both parts are required; a container with only one of them is not
        # a post (the original tested with `or` and crashed on such nodes).
        if profile is None or content is None:
            continue
        date = profile.get_text().strip()[POSTPROFILE_DATE_OFFSET:]
        comment = content.get_text().strip()
        yield date, comment


def store_teacher_comments(conn, cursor, school_name, teacher_name, thread_url):
    """Fetch one teacher's thread and insert each post as a COMMM row."""
    thread_soup = fetch_soup(thread_url, timeout=20)
    if thread_soup is None:
        return
    for date, comment in iter_posts(thread_soup):
        print(school_name, teacher_name, date, comment)
        row = (_clip(school_name), _clip(teacher_name),
               _clip(date), _clip(comment))
        cursor.execute(INSERT_SQL, row)
        # Commit per row so progress survives an interrupted crawl.
        conn.commit()


def crawl_school(conn, cursor, school_name, school_url):
    """Visit every teacher listed on a school page that has comments."""
    school_soup = fetch_soup(school_url, timeout=20)
    if school_soup is None:
        return
    for teacher_row in school_soup.find_all('dl', class_="icon"):
        count_cell = teacher_row.dd   # first <dd>: the teacher's comment count
        teacher_anchor = teacher_row.a
        if count_cell is None or teacher_anchor is None:
            continue
        href = teacher_anchor.get('href')
        if not href:
            continue
        # Only threads with a non-zero comment count are worth fetching.
        if count_cell.get_text().strip().startswith('0'):
            continue
        thread_url = urllib.parse.urljoin(school_url, href)
        store_teacher_comments(conn, cursor, school_name,
                               teacher_anchor.get_text(), thread_url)


def crawl_forum(conn, cursor):
    """Walk the forum index and crawl each school it links to."""
    index_soup = fetch_soup(FORUM_INDEX_URL, timeout=10, announce_retry=True)
    if index_soup is None:
        return
    for school_row in index_soup.find_all('li', class_="row"):
        school_anchor = school_row.a
        if school_anchor is None or not school_anchor.get('href'):
            continue
        # Random pause between schools to avoid hammering the site.
        time.sleep(MAX_POLITE_DELAY_SECONDS * random.random())
        school_url = urllib.parse.urljoin(FORUM_INDEX_URL,
                                          school_anchor.get('href'))
        crawl_school(conn, cursor, school_anchor.get_text(), school_url)


def main():
    """Recreate the COMMM table and fill it from the forum."""
    print('connect to mysql...')
    conn = mysql.connector.connect(**DB_SETTINGS)
    print("connected!")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("DROP TABLE IF EXISTS COMMM")
            cursor.execute(CREATE_TABLE_SQL)

            # SECURITY NOTE: this disables HTTPS certificate verification
            # for the whole process. The original author added it because
            # certificate validation failed for this site; it leaves the
            # crawl open to man-in-the-middle tampering.
            ssl._create_default_https_context = ssl._create_unverified_context

            crawl_forum(conn, cursor)
        finally:
            cursor.close()
    finally:
        # One connection is shared by the whole run and always released,
        # instead of opening a new, never-closed connection per comment.
        conn.close()


if __name__ == "__main__":
    main()
阅读全文
0 0