Python 提取新浪微博博文中的元素（包含 Text、Screen_name 和 Words）

来源:互联网 发布:工业企业数据库 161 编辑:程序博客网 时间:2024/04/30 08:08

CODE:

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-7-8
@author: guaguastd
@name: extractWeiboEntities.py

Fetch statuses from the Sina Weibo public timeline, extract the status
texts, screen names and words from them, and print the first five items
of each list as JSON.
'''
import json


def main():
    '''Log in, fetch 200 public statuses and print a sample of their entities.

    Output goes to stdout; nothing is returned.
    '''
    # Project-local helpers are imported here rather than at module level so
    # that importing this file (instead of running it) does not require them,
    # matching the original script, which imported them inside the main guard.

    # Get the weibo API handle used to access the Sina API.
    from sinaWeiboLogin import sinaWeiboLogin
    sinaWeiboApi = sinaWeiboLogin()

    # Entity extraction helper.
    from sinaWeibo import extractWeiboEntities

    # Public-timeline fetch helper (sinaWeiboStatuses).
    from sinaWeiboStatuses import publicTimeline

    # Get the newest 200 weibo statuses.
    statuses = publicTimeline(sinaWeiboApi, 200)
    status_texts, screen_names, words = extractWeiboEntities(statuses)

    # Explore the first 5 items for each...
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(json.dumps(status_texts[0:5], indent=1))
    print(json.dumps(screen_names[0:5], indent=1))
    print(json.dumps(words[0:5], indent=1))


if __name__ == '__main__':
    main()

RESULT:

[
 "[\u795e\u9a6c]2014\u590f\u5b63\u65b0\u6b3e\u5973\u88c5\u97e9\u56fd\u4e1c\u5927\u95e8\u4ee3\u8d2d \u65e0\u8896t\u6064\u5973\u4fee\u8eab\u5706\u9886\u663e\u7626\u96ea\u7eba\u4e0a\u8863  http://t.cn/RvCUVwB",
 "\u52ff\u5fd8\u56fd\u803b\uff0c\u632f\u5174\u4e2d\u534e\uff01\u81f3\u4eca\u65e0\u6cd5\u5fd8\u8bb0\u65e5\u5bc7\u523a\u5200\u4e0a\u7684\u5a74\u513f\uff01\uff01\uff01\uff01\uff01\uff01\uff01\u75db\u5fc3\u75be\u9996 \u6211\u5206\u4eab\u4e86http://t.cn/Rvdm1cn",
 "\u7626\u8138\u7684\u4ea7\u54c1\u7528\u8fc7\u597d\u591a\u597d\u591a\uff0c\u603b\u662f\u4ee5\u89c1\u4e0d\u5230\u6548\u679c\u7ed3\u5c40\uff01\u4f46\u662f\u4e00\u76f4\u8ffd\u6c42V\u8138\u7684\u5fc3\u6ca1\u6539\u53d8\u8fc7\uff01\u76f4\u5230\u6211\u627e\u5230\u4e86\u8fd9\u4e2a\u4f70\u8349\u4e16\u5bb6V\u8138\u795e\u5668\uff01[\u5fc3]\u6d82\u4e0a\u7acb\u523b\u5c31\u6709\u7d27\u81f4\u611f\uff0c\u7761\u524d\u6d82\u62b9\uff0c\u9192\u6765\u770b\u5230\u7684\u6548\u679c\u4f60\u771f\u7684\u4f1a\u5c16\u53eb\u7684\uff01[\u7231\u4f60]\u54ea\u91cc\u4e0d\u7626\u6d82\u54ea\u91cc\uff0c\u518d\u4e5f\u4e0d\u7528\u62c5\u5fc3\u6211\u7684\u5305\u5b50\u8138\u5566\uff01\u7f8e\u4e3d\u4fcf\u4f73\u4eba\u63a8\u8350\uff1a[\u4e2d\u7bad]http://t.cn/RvntLNh",
 "\u5a01\u6b66MAERZ2014\u6625\u88c5\u65b0\u6b3e\u7537\u88c5\u957f\u8896\u886c\u886b \u97e9\u7248\u4fee\u8eab\u7537\u58eb\u7ecf\u5178\u7eaf\u68c9\u683c\u5b50\u886c\u8863\u6f6e  http://t.cn/RvCyu61",
 "[\u563b\u563b]2014\u590f\u88c5\u65b0\u6b3e\u5973\u58eb\u788e\u82b1\u886c\u886b\u4fee\u8eab\u5927\u7801\u957f\u8896\u7eaf\u68c9\u5370\u82b1\u886c\u8863\u97e9\u7248\u4e0a\u8863\u6f6e  http://t.cn/RvCUIw5"
]
[
 "\u53e4\u6708\u79cb\u666f",
 "Lcineferit",
 "\u7efd\u653e\u9ec4\u8272\u7261\u4e39aa",
 "\u4e8c\u9505\u9505\u4e8c\u59d0\u59d0",
 "lang\u6d6a\u6f2b\u66f2"
]
[
 "[\u795e\u9a6c]2014\u590f\u5b63\u65b0\u6b3e\u5973\u88c5\u97e9\u56fd\u4e1c\u5927\u95e8\u4ee3\u8d2d",
 "\u65e0\u8896t\u6064\u5973\u4fee\u8eab\u5706\u9886\u663e\u7626\u96ea\u7eba\u4e0a\u8863",
 "http://t.cn/RvCUVwB",
 "\u52ff\u5fd8\u56fd\u803b\uff0c\u632f\u5174\u4e2d\u534e\uff01\u81f3\u4eca\u65e0\u6cd5\u5fd8\u8bb0\u65e5\u5bc7\u523a\u5200\u4e0a\u7684\u5a74\u513f\uff01\uff01\uff01\uff01\uff01\uff01\uff01\u75db\u5fc3\u75be\u9996",
 "\u6211\u5206\u4eab\u4e86http://t.cn/Rvdm1cn"
]


0 0
原创粉丝点击