Python Douban crawler: writing a fixed number of records per output file

__author__ = 'huafeng'

import os
import time
import codecs
import logging
import urllib2
import random
from bs4 import BeautifulSoup

PATH = os.path.dirname(os.path.abspath(__file__))
# Drawn once at startup, so every request sleeps the same 2-5 seconds.
SLEEP_INTERVAL = random.randint(2, 5)

class MovieActor:
    def __init__(self):
        self._gen_log()
        self.proxy_list = []
        # self._gen_proxy()
        self.actor_content_text_list = []
        self.timeout_url_list = []

    def _gen_log(self):
        """Log DEBUG and above to log/douban_actor_crawler.log."""
        logfile = os.path.join(PATH, 'log', 'douban_actor_crawler.log')
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        log_file = logging.FileHandler(logfile)
        log_file.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        log_file.setFormatter(formatter)
        self.logger.addHandler(log_file)

    def _gen_proxy(self):
        """Load one proxy per line from sys/xici_proxy (currently unused)."""
        filename = os.path.join(PATH, 'sys', 'xici_proxy')
        with codecs.open(filename, encoding='utf-8') as f:
            self.proxy_list.extend([item.strip() for item in f.readlines()])

    def parse_actor_content_url(self, url):
        """Fetch one celebrity page and buffer its biography text."""
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except Exception:
            # Retry once before giving up and recording the URL as failed.
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:
                self.timeout_url_list.append(url)
                self.logger.debug('request timeout in item_url:%s' % url)
                return
        soup = BeautifulSoup(html, 'html.parser')
        div_level_str = soup.find('div', id='content')
        if not div_level_str:
            self.logger.error('no content div found in url:%s' % url)
            return
        # Prefer the full (hidden) summary when the page collapses long bios.
        all_hidden = div_level_str.find('span', class_='all hidden')
        actor_summary = all_hidden if all_hidden else div_level_str.find('div', class_='bd')
        if not actor_summary:
            self.logger.debug('no actor summary found in item_url:%s' % url)
            return  # without this return, .text below raises AttributeError
        actor_content_text = actor_summary.text.strip()
        if actor_content_text:
            print actor_content_text
            self.actor_content_text_list.append(actor_content_text)

    def write_content_into_file(self):
        """Flush the buffered summaries into a new timestamp-named file."""
        timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt')
        filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp)
        actor_content_text_list = [item + '\n' for item in self.actor_content_text_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as wf:
            wf.writelines(actor_content_text_list)

    def write_timeout_url(self):
        """Append failed URLs to a log file, then clear the buffer."""
        filename = os.path.join(PATH, 'log', 'actor_info_timeout_url')
        timeout_url_list = [item + '\n' for item in self.timeout_url_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as f:
            f.writelines(timeout_url_list)
        # Clear the list so the same URLs are not re-written after every batch.
        self.timeout_url_list[:] = []

    def main(self):
        range_from = 1000000
        range_to = 1339959
        url_pattern = 'http://movie.douban.com/celebrity/%d/'
        url_list = [url_pattern % item for item in range(range_from, range_to + 1)]
        url_count = range_to - range_from + 1  # +1: the ID range is inclusive
        con_count_write_in_file = 2000         # records per output file
        file_count = url_count // con_count_write_in_file
        # Full batches: crawl 2000 URLs, then flush them to one file.
        for file_point in range(file_count):
            for url_point in range(file_point * con_count_write_in_file,
                                   (file_point + 1) * con_count_write_in_file):
                self.parse_actor_content_url(url_list[url_point])
                time.sleep(SLEEP_INTERVAL)
            self.write_content_into_file()
            self.actor_content_text_list[:] = []
            if self.timeout_url_list:
                self.write_timeout_url()
        # Final partial batch: whatever is left over after the full batches.
        url_left = url_count - file_count * con_count_write_in_file
        for i in range(1, url_left + 1):
            self.parse_actor_content_url(url_list[-i])
            time.sleep(SLEEP_INTERVAL)
        self.write_content_into_file()
        self.actor_content_text_list[:] = []
        if self.timeout_url_list:
            self.write_timeout_url()

if __name__ == '__main__':
    actor = MovieActor()
    actor.main()
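The nested try/except in parse_actor_content_url retries each request exactly once before recording the URL as failed. The same idea can be written as a reusable helper; this is a minimal sketch, assuming Python 2 (the name fetch_with_retry and the retries parameter are illustrative, not part of the original script):

import socket
import urllib2

def fetch_with_retry(url, retries=2, timeout=15):
    """Return the page body, or None once all attempts have failed."""
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except (urllib2.URLError, socket.timeout):
            if attempt == retries - 1:
                return None

html = fetch_with_retry('http://movie.douban.com/celebrity/1000000/')

Folding the retry into a loop keeps the calling code flat and turns the retry count into a parameter instead of duplicated try/except blocks.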
In main(), the crawl is split into fixed batches: every 2000 records are flushed to a new timestamp-named file. This avoids checking the buffer length on every append to decide whether the threshold has been reached. More importantly, a scheme that only writes when the count hits the threshold would silently drop the final partial batch, losing up to 2000 records; flushing each full batch in the outer loop and then explicitly flushing the leftover URLs at the end avoids that data loss, as the sketch below shows.
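Here is the flush-every-N pattern in isolation, as a hedged sketch (flush and batch_size are illustrative names; the real script uses write_content_into_file and con_count_write_in_file):

import time
import codecs

def flush(buf):
    # One timestamp-named file per batch, mirroring write_content_into_file().
    filename = time.strftime('%Y_%m_%d_%H%M%S.txt')
    with codecs.open(filename, mode='a', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in buf)

def write_in_batches(items, batch_size=2000):
    buf = []
    for item in items:
        buf.append(item)
        if len(buf) == batch_size:
            flush(buf)
            buf[:] = []   # reuse the buffer for the next batch
    if buf:
        # The final partial batch: without this flush, up to
        # batch_size - 1 items would never reach disk.
        flush(buf)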