我的第一个Python爬虫

来源：互联网　发布：iis绑定域名　编辑：程序博客网　时间：2024/06/05 17:48
参考了网上很多资料
就是抓取http://m.i21st.cn/speaking/oraltraining_1.html这个网站上的英语资料，没事练练英语~哈哈~
# -*- coding: utf-8 -*-
"""Console reader for the English speaking-practice articles on m.i21st.cn.

A background thread downloads the numbered index pages
(``/speaking/oraltraining_<n>.html``) and queues the article links found on
each one.  The main thread prints every article title and body, then waits
for Enter before moving on; typing ``quit`` stops the program.

The script runs unchanged on Python 2.7 and Python 3.
"""
import contextlib
import re
import threading
import time
import urllib  # kept from the original import list; not used directly

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

try:  # Python 2
    read_line = raw_input
except NameError:  # Python 3
    read_line = input


class HTML_Model(object):
    """Downloads index pages in the background and shows articles one by one.

    Attributes:
        page: number of the next index page the loader will request.
        pages: queue of parsed index pages; each entry is a list of
            ``[href, title]`` pairs as returned by ``GetPage``.
        enable: run flag shared by the loader thread and the display loop.
    """

    BASE_URL = "http://m.i21st.cn"
    # Seconds before a stalled request is abandoned; without a timeout a
    # hung connection would block the loader (or the reader) forever.
    TIMEOUT = 10

    # re.S lets "." match newlines, because the markup spans several lines.
    _INDEX_RE = re.compile(
        r"</div><a.*?href='(.*?)'.*?class='h3'>(.*?)</a>", re.S)
    _CONTENT_RE = re.compile(
        r"<a.*?id='contentbegin'.*?name='contentbegin'></a>(.*?)</div>", re.S)
    _PARAGRAPH_RE = re.compile(r"<br.*?/>(.*?)</p>", re.S)

    # Markup stripped from the article body, and the three quote entities the
    # original handled.  All three are rendered as a plain apostrophe, which
    # is what the original script printed for them.
    _REPLACEMENTS = (
        ("<br />", ""),
        ("<strong>", ""),
        ("</strong>", ""),
        ("&rsquo;", "'"),
        ("&ldquo;", "'"),
        ("&rdquo;", "'"),
    )

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def _fetch(self, url):
        """Return the body of *url* decoded as UTF-8, closing the response."""
        with contextlib.closing(urlopen(url, timeout=self.TIMEOUT)) as response:
            return response.read().decode("utf-8")

    @classmethod
    def _parse_index(cls, html):
        """Return the ``[href, title]`` pairs found in an index page."""
        return [
            [href.replace("\n", ""), title.replace("\n", "")]
            for href, title in cls._INDEX_RE.findall(html)
        ]

    @classmethod
    def _extract_text(cls, html):
        """Return the readable text of an article page.

        Returns None when the page does not contain the expected markup,
        instead of raising IndexError as the original indexing did.
        """
        content = cls._CONTENT_RE.search(html)
        if content is None:
            return None
        paragraph = cls._PARAGRAPH_RE.search(content.group(1))
        if paragraph is None:
            return None
        text = paragraph.group(1)
        # Whole entities (including the trailing ";") are replaced, so real
        # apostrophes and semicolons in the text are left untouched.
        for old, new in cls._REPLACEMENTS:
            text = text.replace(old, new)
        return text

    def GetPage(self, page):
        """Download index page number *page* and return its article links.

        Args:
            page: page number, as a string (an int is accepted as well).

        Returns:
            A list of ``[href, title]`` lists with newlines removed.

        Raises:
            Whatever ``urlopen`` raises on network failure, or
            UnicodeDecodeError if the page is not valid UTF-8.
        """
        url = "%s/speaking/oraltraining_%s.html" % (self.BASE_URL, page)
        return self._parse_index(self._fetch(url))

    def LoadPage(self):
        """Keep up to two index pages queued while ``self.enable`` is set.

        Intended to run in a background thread (see ``Start``).
        """
        while self.enable:
            if len(self.pages) >= 2:
                time.sleep(1)
                continue
            try:
                items = self.GetPage(str(self.page))
            except Exception:
                # Best effort, as before: report and retry the same page, but
                # wait first so a failing page is not requested in a tight loop.
                print(u'无法显示新的资料！')
                time.sleep(1)
            else:
                self.page += 1
                self.pages.append(items)

    def ShowPage(self, q, page):
        """Print every article of one index page, pausing after each.

        Args:
            q: list of ``[href, title]`` pairs for the index page.
            page: display number of the index page.

        Typing ``quit`` at the pause clears ``self.enable`` and returns.
        """
        for item in q:
            print(u'第%d页 %s' % (page, item[1]))
            try:
                text = self._extract_text(self._fetch(self.BASE_URL + item[0]))
            except (IOError, ValueError):
                # Network failure or undecodable page: skip this article
                # rather than crash the whole session.
                text = None
            print(text if text is not None else u'无法显示新的资料！')
            if read_line() == "quit":
                self.enable = False
                break

    def Start(self):
        """Start the background loader and run the interactive display loop."""
        self.enable = True
        page = self.page
        print(u'正在加载中请稍候......')

        # Load and queue index pages in a background thread.  It is a daemon
        # thread so that it cannot keep the process alive after the user quits.
        loader = threading.Thread(target=self.LoadPage)
        loader.daemon = True
        loader.start()

        while self.enable:
            if self.pages:
                now_page = self.pages.pop(0)
                self.ShowPage(now_page, page)
                page += 1
            else:
                # Nothing queued yet: yield instead of spinning at full CPU.
                time.sleep(0.1)


def main():
    """Print the banner, wait for Enter, then start browsing."""
    print(u"""
---------------------------------------
   程序：英语学习—爬虫
   语言：Python2.7
   作者：xiantian
   功能：按下回车依次浏览今日的英语资料
---------------------------------------
""")
    print(u'请按下回车浏览今日新的内容：')
    read_line(' ')
    myModel = HTML_Model()
    myModel.Start()


# ----------- program entry point -----------
if __name__ == "__main__":
    main()
0 0