URL分析小结

来源：互联网 | 发布：java小游戏程序代码 | 编辑：程序博客网 | 时间：2024/06/08 09:06

今天总算得空，着手进行了一下自己想了很久的想法，虽然只实现了简易版（想法比较复杂，这次只是实现一个原型）。但是收获颇多。因此总结一下，权作笔记罢。

想法：

想法说起来很简单：凭借浏览器里的历史url，对自己的上网习惯进行分析。本次只是实现了一小部分，以后还要进行深入分析，敬请期待！！

本次实现：

本次使用python实现了对浏览器历史记录的简单的统计和生成pdf。

知识：

URL的获取：

我获取的是chrome的历史记录，chrome的历史记录是在C:\Users\用户名\AppData\Local\Google\Chrome\User Data\Default\History；我的实验环境是win7系统，chrome版本20.0.1096.1，默认的安装目录；chrome历史记录存储使用轻量级数据库sqlite存储。具体的读取等下看程序。

pdf的生成：

我使用的是reportLab程序包，它是python的模块，我没有安装，直接用的源码，当然，要使用高级功能的话恐怕得安装了。下载地址：http://www.reportlab.com/ftp/ 我下载的是reportlab-2.6.zip，下载后直接将src下的reportlab文件夹拷贝到site-packages目录下即可使用import导入。

代码：

查看python路径：

>>> import sys,pprint
>>> pprint.pprint(sys.path)
['',
 'D:\\CodeSoft\\python2.7\\Lib\\idlelib',
 'C:\\Windows\\system32\\python27.zip',
 'D:\\CodeSoft\\python2.7\\DLLs',
 'D:\\CodeSoft\\python2.7\\lib',
 'D:\\CodeSoft\\python2.7\\lib\\plat-win',
 'D:\\CodeSoft\\python2.7\\lib\\lib-tk',
 'D:\\CodeSoft\\python2.7',
 'D:\\CodeSoft\\python2.7\\lib\\site-packages']

D:\CodeSoft\python2.7是我的python安装路径，不同的安装这一点是不一样的。一般情况下将第三方的模块放入到最后那个site-packages目录下导入。

urlcount.py代码：

#!/usr/bin/env python
# -*- coding: gb18030 -*-
"""Count visits per site in a Chrome history database.

Reads the ``urls`` table of a Chrome ``History`` SQLite file, reduces every
URL to its scheme-plus-host prefix, sums ``visit_count`` per prefix and writes
the totals, highest first, to a tab-separated text file.
"""
import io
import re
import time
import math
import sqlite3
import string
from contextlib import closing


class urlCount:
    """Aggregate ``visit_count`` per site and dump the ranking to a file."""

    def __init__(self, history_db, out_file):
        """Remember the input database path and the output file path.

        history_db -- path of the SQLite file holding the ``urls`` table.
        out_file   -- path of the text file written by writeToFile().
        """
        self.history_D = history_db
        self.out_F = out_file
        self.query = 'select url,visit_count from urls'
        # Maps trimmed URL -> accumulated visit count.
        self.url_Count = {}

    def __cutUrl(self, url):
        """Return ``url`` truncated just before the first '/' after the host.

        The search starts right after the ``://`` separator (or at the
        beginning when there is none) instead of at a fixed character offset,
        so short hosts such as ``http://a/x`` are trimmed as well and long
        schemes are not cut in the middle of their ``//``.  A URL without any
        such '/' is returned unchanged.
        """
        scheme_end = url.find('://')
        host_start = scheme_end + 3 if scheme_end != -1 else 0
        pos = url.find('/', host_start)
        if pos != -1:
            url = url[:pos]
        return url

    def countUrl(self):
        """Run the query and add every row's visit count to ``url_Count``.

        Counts accumulate into the existing dictionary, so calling this
        twice doubles the totals.
        """
        # closing() guarantees the connection is released even if a row
        # fails to process.
        with closing(sqlite3.connect(self.history_D)) as conn:
            for url, visit_count in conn.execute(self.query):
                site = self.__cutUrl(url)
                self.url_Count[site] = self.url_Count.get(site, 0) + visit_count

    def writeToFile(self):
        """Write ``<url> TAB <count>`` lines to ``out_F``, largest count first.

        The file is encoded as gb18030.  Encoding is delegated to io.open so
        the same code works on Python 2.7 and Python 3 (writing pre-encoded
        bytes to a text-mode file fails on Python 3).
        """
        ranked = sorted(self.url_Count.items(), key=lambda d: d[1], reverse=True)
        with io.open(self.out_F, 'w', encoding='gb18030') as out:
            for site, count in ranked:
                out.write(u'{0}\t{1}\n'.format(site, count))


def factory():
    """Build a urlCount wired to the author's hard-coded local paths."""
    history_db = r'F:\history\History'
    out_file = r'F:\history\count_result'
    uc = urlCount(history_db, out_file)
    return uc


if __name__ == '__main__':
    uc = factory()
    uc.countUrl()
    uc.writeToFile()

chart.py

#!/usr/bin/env python
# -*- coding: gb18030 -*-
"""Plot the most visited sites from a ``<url> TAB <count>`` file into a PDF."""
import io
import re
import string


class ChartCreator:
    """Read a ranked url/count file and render the top entries as a PDF chart."""

    def __init__(self, data_file, chart_pdf):
        """Remember the input data file path and the output PDF path."""
        self.data_F = data_file
        self.chart_P = chart_pdf
        # Filled by readData() with the keys 'x' (1-based rank), 'y' (count)
        # and 'urls'.
        self.data = {}

    def createChart(self):
        """Draw a rank/count line plot plus a rank -> url legend into ``chart_P``.

        readData() must have been called first; otherwise the 'x'/'y'/'urls'
        keys are missing and a KeyError is raised.
        """
        # reportlab is imported here, with explicit names, so that the
        # parsing half of this class can be used without reportlab installed.
        from reportlab.graphics import renderPDF
        from reportlab.graphics.charts.lineplots import LinePlot
        from reportlab.graphics.shapes import Drawing, String
        from reportlab.graphics.widgets.markers import makeMarker
        from reportlab.lib import colors

        ranks = self.data['x']
        counts = self.data['y']

        chart_width = 300
        chart_height = 125
        pdf_width = 400
        # 200 units for the plot area plus one 20-unit legend row per entry.
        pdf_height = 200 + 20 * len(ranks)
        drawing = Drawing(pdf_width, pdf_height)

        lp = LinePlot()
        lp.x = 50
        lp.y = pdf_height - 200 + 50
        lp.height = chart_height
        lp.width = chart_width
        # list() is required on Python 3, where zip() returns an iterator.
        lp.data = [list(zip(ranks, counts))]
        lp.lineLabelFormat = '%2.0f'
        lp.xValueAxis.valueSteps = [0] + ranks
        lp.xValueAxis.valueMin = 0
        # Keep the original 100..800 range but widen it when the data falls
        # outside, so no point is drawn off the chart.
        lp.yValueAxis.valueMin = min([100] + counts)
        lp.yValueAxis.valueMax = max([800] + counts)
        lp.lines[0].strokeColor = colors.blue
        lp.lines[0].symbol = makeMarker('FilledCircle')
        drawing.add(lp)

        # Legend: one "rank:  url" row per entry, rank 1 on the top row.
        for num, url in zip(ranks, self.data['urls']):
            item_str = str(num) + ':  ' + url
            str_x = 50
            str_y = (len(ranks) - num + 1) * 20
            drawing.add(String(str_x, str_y, item_str, fontSize=12))

        renderPDF.drawToFile(drawing, self.chart_P, 'url count')

    def readData(self, min_count=150):
        """Load leading entries of ``data_F`` whose count is >= ``min_count``.

        Each non-blank line must look like ``<url> TAB <count>``.  Reading
        stops at the first entry below ``min_count``, which yields the top
        entries when the file is sorted by descending count.  Any previously
        loaded data is discarded, so the method can be called repeatedly.

        Raises ValueError (or IndexError) on a non-blank line that does not
        match the expected format.
        """
        self.data['x'] = []
        self.data['y'] = []
        self.data['urls'] = []
        # NOTE(review): gb18030 is assumed to match the encoding the data
        # file was written with -- confirm against the producer.
        with io.open(self.data_F, 'r', encoding='gb18030') as in_client:
            for line in in_client:
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                # Split on the last tab only; the original character-class
                # regex also split on literal '(' and ')'.
                url, num = line.rsplit('\t', 1)
                num = int(num)
                if num < min_count:
                    break
                self.data['x'].append(len(self.data['x']) + 1)
                self.data['y'].append(num)
                self.data['urls'].append(url)


def factory():
    """Build a ChartCreator wired to the author's hard-coded local paths."""
    data_file = r'F:\history\count_result'
    chart_pdf = r'F:\history\chart.pdf'
    cc = ChartCreator(data_file, chart_pdf)
    return cc


if __name__ == '__main__':
    cc = factory()
    cc.readData()
    cc.createChart()

结果：

pdf只有一页，截图如下：

可以看到，本人平素比较喜欢刷微博和人人，喜欢上豆瓣，喜欢使用谷歌的产品等等。Ps：看来以后要少刷微博神马的了，毕竟哥是一个求上进的人呐。