Python项目四：新闻聚合

来源：互联网发布：怎么在淘宝网上卖产品编辑：程序博客网时间：2024/05/17 06:08

代码地址：https://code.csdn.net/ranky2009/pythonsmallproject

编码中遇到的问题Note：
1.在编写中发现使用newnews时出现如下错误：
502 NEWNEWS command disabled by administrator

由于使用newnews命令出现问题，所以改为使用其他的命令替代。用group 和article命令，见项目代码。
2.使用如下code打印中文会报错误
代码url_test.py：
from urllib.request import urlopen

with urlopen('http://www.baidu.com') as f:
    print(f.read().decode('utf-8'))
错误：
UnicodeEncodeError: 'gbk' codec can't encode character '\ufeff' in position 0: illegal multibyte sequence

错误环境是OS：WIN7， version of python：3.4.3，窗口：控制台cmd
后来发现，这个错误是由于在控制台中运行(python url_test.py)代码。
解决方法：
开始->所有程序->启动IDLE (Python 3.4 GUI - 64 bit)->File->New File，将以上代码拷贝到新文件中，F5运行代码，发现能正确读取中文。
注意：对于不同的网页，要看网页源码是怎样编码方式，有的是utf-8，有的是gb2312。
3.关于match
re.match必须从字符串的开头开始匹配（不会在字符串中间查找），例如：
import re
m = re.match('.end', '1end')
print(m.group()) #能够匹配到1end

n = re.match('.end', '12end')
print(n.group()) #n为None，报错

对书上原始代码略做修改，如下：

"""News aggregator: collect items from NNTP and web sources and publish them.

A source is any object with a ``getItems()`` method returning an iterable of
NewsItem objects; a destination is any object with a ``receiveItems(items)``
method.  NewsAgent passes the items of every source to every destination.
"""
import html
import re
import textwrap
from email import message_from_string
from time import strftime, time, localtime
from urllib.request import urlopen

try:
    from nntplib import NNTP
except ImportError:
    # nntplib was removed from the standard library in Python 3.13.  Keep the
    # module importable so the web source and the destinations still work;
    # NNTPSource.getItems() reports the problem when it is actually used.
    NNTP = None


def wrap(string, max=70):
    """Wrap *string* so that no line is longer than *max* characters.

    Returns the wrapped lines joined with newlines, followed by one trailing
    newline (an empty input therefore yields a single newline).
    """
    # The width is passed explicitly; without it textwrap always uses its own
    # default and the ``max`` argument has no effect.
    return '\n'.join(textwrap.wrap(string, max)) + '\n'


class NewsAgent:
    """Distributes news items from registered sources to registered destinations."""

    def __init__(self):
        self.sources = []
        self.destinations = []

    def addSource(self, source):
        """Register an object providing ``getItems()``."""
        self.sources.append(source)

    def addDestination(self, dest):
        """Register an object providing ``receiveItems(items)``."""
        self.destinations.append(dest)

    def distribute(self):
        """Collect the items of all sources and hand the list to each destination."""
        # Materialize once: sources may be generators, and every destination
        # needs to see the complete list.
        items = []
        for source in self.sources:
            items.extend(source.getItems())

        for dest in self.destinations:
            dest.receiveItems(items)


class NewsItem:
    """A single news item consisting of a title and a body."""

    def __init__(self, title, body):
        self.title = title
        self.body = body


class NNTPSource:
    """News source that reads articles from a group on an NNTP server.

    ``window`` is stored on the instance but is not used by getItems(); the
    articles are fetched with the GROUP and ARTICLE commands, starting at the
    first article number the server reports for the group.
    """

    # Maximum number of articles fetched per getItems() call.
    ARTICLE_COUNT = 5

    def __init__(self, servername, group, window):
        self.servername = servername
        self.group = group
        self.window = window

    def getItems(self):
        """Yield a NewsItem for each of the first articles of the group.

        Raises RuntimeError if the nntplib module is not available.
        """
        if NNTP is None:
            raise RuntimeError('nntplib is not available in this Python version')

        server = NNTP(self.servername)
        try:
            # group() returns (response, count, first, last, name).
            group_info = server.group(self.group)
            first = int(group_info[2])
            last = int(group_info[3])
            # Never ask for article numbers beyond the last one in the group.
            stop = min(first + self.ARTICLE_COUNT, last + 1)

            for number in range(first, stop):
                article_info = server.article(str(number))[1]
                # Article lines arrive as bytes; undecodable bytes are replaced
                # instead of aborting the whole run.
                lines = [line.decode(errors='replace')
                         for line in article_info.lines]
                message = message_from_string('\n'.join(lines))

                title = message['subject'] or ''
                if message.is_multipart():
                    # Use the payload of the first non-container part.
                    part = next((p for p in message.walk()
                                 if not p.is_multipart()), None)
                    body = part.get_payload() if part is not None else ''
                else:
                    body = message.get_payload()

                yield NewsItem(title, body)
        finally:
            # Runs even when the consumer stops iterating early or a request
            # fails, so the connection is not left open.
            server.quit()


class SimpleWebSource:
    """News source that extracts a title and a body from a web page by regex."""

    # A paragraph must be longer than this to be used as the item body.
    MIN_BODY_LENGTH = 30

    def __init__(self, url, titlePattern, bodyPattern):
        self.url = url
        self.titlePattern = re.compile(titlePattern)
        self.bodyPattern = re.compile(bodyPattern)

    def getItems(self):
        """Yield at most one NewsItem built from the page at ``self.url``.

        The item pairs the first title match with the first body match longer
        than MIN_BODY_LENGTH characters.  Nothing is yielded when the title
        pattern does not match or no body match is long enough.
        """
        with urlopen(self.url) as response:
            raw = response.read()
            # Pages are not always UTF-8 (e.g. gb2312); honor the charset
            # declared in the response headers when there is one.
            charset = response.headers.get_content_charset() or 'utf-8'

        try:
            text = raw.decode(charset, errors='replace')
        except LookupError:
            # The server declared a charset Python does not know.
            text = raw.decode('utf-8', errors='replace')

        titles = self.titlePattern.findall(text)
        if not titles:
            return

        for paragraph in self.bodyPattern.findall(text):
            if len(paragraph) > self.MIN_BODY_LENGTH:
                yield NewsItem(titles[0], wrap(paragraph))
                break


class PlainDestination:
    """Destination that prints every item as plain text on standard output."""

    def receiveItems(self, items):
        """Print title, an underline of dashes and the body of each item."""
        for item in items:
            print(item.title)
            print('-' * len(item.title))
            print(item.body)


class HTMLDestination:
    """Destination that writes all items to a single UTF-8 encoded HTML file."""

    _HEADER = """
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
        <title>Today's News</title>
    </head>
    <body>
        <h1>Today's News</h1>
"""

    _FOOTER = """
    </body>
</html>
"""

    def __init__(self, filename):
        self.filename = filename

    def receiveItems(self, items):
        """Write an index list followed by one titled section per item."""
        # The items are traversed twice (index, then sections), so a generator
        # argument must be materialized first.
        items = list(items)

        with open(self.filename, 'w', encoding='utf-8') as out:
            print(self._HEADER, file=out)

            print('<ul>', file=out)
            for number, item in enumerate(items, start=1):
                print('<li><a href="#%i">%s</a></li>'
                      % (number, html.escape(str(item.title))), file=out)
            print('</ul>', file=out)

            for number, item in enumerate(items, start=1):
                # Titles and bodies come from the network; escape them so they
                # cannot break or inject markup into the page.
                print('<h2><a name="%i">%s</a></h2>'
                      % (number, html.escape(str(item.title))), file=out)
                print('<pre>%s</pre>' % html.escape(str(item.body)), file=out)

            print(self._FOOTER, file=out)


def runDefaultSetup():
    """Run the agent with one web source, one NNTP source and both destinations."""
    agent = NewsAgent()

    web_url = 'http://m.cnr.cn/news/20150706/t20150706_519082828_tt.html?tt_group_id=4658836325'
    web_title = '<title>(.+?)</title>'
    web_body = '<p.*>(.+?)</p>'
    agent.addSource(SimpleWebSource(web_url, web_title, web_body))

    clpa_server = 'nntpswitch.blueworldhosting.com'
    clpa_group = 'comp.lang.python.announce'
    clpa_window = 1
    agent.addSource(NNTPSource(clpa_server, clpa_group, clpa_window))

    agent.addDestination(PlainDestination())
    agent.addDestination(HTMLDestination('news.html'))

    agent.distribute()


if __name__ == '__main__':
    runDefaultSetup()

0 0