采集文献不认识单词上传扇贝网
来源:互联网 发布:免费网络摄像头直播 编辑:程序博客网 时间:2024/04/30 03:07
程序分为三个功能块
1、sql() 数据库功能模块:基于从网上下载的源代码,并在其基础上加了一些自己的改动(源码出处见原文中的链接)。
2、Txt() 文本处理模块:从文本中搜寻单词并与本地数据库对比;对不在本地数据库中的单词,去有道翻译获取释义,去掉人名、地名和短语后写入数据库,同时写入 txt 文件(上传用)。
3、url()翻译模块。
另外还做了一个数据库:把从网上下载的单词 txt 文件转入数据库中,库中的表包括四级(cet_4)和六级(cet_6)词汇。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Collect unfamiliar English words from text files for upload to Shanbay.

Three cooperating classes:

* ``sql`` -- thin helper around a local SQLite word database.
* ``Txt`` -- extracts candidate words from ``.txt`` files, filters out the
  ones already present in the database and records the rest.
* ``url`` -- looks a word up through the Youdao translation web endpoint.
"""
import codecs
import json
import os
import sqlite3
import sys
import time
import urllib.parse
import urllib.request

# Default location of the SQLite database file (current working directory).
DB_FILE_PATH = os.path.join(os.getcwd(), 'test.db')
# Name of the main word table.
TABLE_NAME = 'cet_4'
# When true, executed SQL statements are echoed to stdout.
SHOW_SQL = True
# Default directory scanned by Txt.deal_txt().
PAPER_DIR = "C:\\Users\\John\\Desktop\\work\\paper"

# Every character that is trimmed from both ends of a token before it is
# considered a word (union of the punctuation the script has always removed).
_STRIP_CHARS = ' !%&*();[]{}\\|/?:"“”,.。`·~-_+=—'

# (suffix, replacement) pairs tried in order; only the first match is applied.
_SUFFIX_RULES = (
    ('ing', ''),
    ('ied', 'y'),
    ('ies', 'y'),
    ('tions', 'tion'),
    ('ed', ''),
    ("'s", ''),
    ('s', ''),
)


class sql(object):
    """Helper around the SQLite word database.

    According to the original notes, test.db holds four tables: cet_4,
    conquer_word, temp and cet_6.  Their columns are id (integer),
    en (text), cn (text; real in conquer_word), add_time (text) and
    conquer (int); cet_6 additionally has classify (text).
    """

    def __init__(self, db_path=None):
        """Remember which database file to use.

        :param db_path: path of the SQLite file; defaults to ``test.db`` in
            the current working directory.
        """
        if db_path is None:
            db_path = os.path.join(os.getcwd(), 'test.db')
        self.db_path = db_path
        print('show_sql : {}'.format(SHOW_SQL))

    def get_conn(self, path):
        """Return a connection for *path*.

        If *path* names a regular file after connecting, the connection to
        that file is returned; otherwise an in-memory database connection is
        returned, so callers never receive ``None``.
        """
        if path:
            conn = sqlite3.connect(path)
            if os.path.isfile(path):
                return conn
            conn.close()
        return sqlite3.connect(':memory:')

    def get_cursor(self, conn):
        """Return a cursor for *conn*, or for an in-memory database if None."""
        if conn is not None:
            return conn.cursor()
        return self.get_conn('').cursor()

    def close_all(self, conn, cu):
        """Close the cursor and then the connection (either may be None)."""
        try:
            if cu is not None:
                cu.close()
        finally:
            if conn is not None:
                conn.close()

    def _run_rows(self, statement, rows):
        """Execute *statement* once per parameter tuple, committing each row."""
        conn = self.get_conn(self.db_path)
        cu = self.get_cursor(conn)
        try:
            for row in rows:
                if SHOW_SQL:
                    print('执行sql:[{}],参数:[{}]'.format(statement, row))
                cu.execute(statement, row)
                conn.commit()
        finally:
            # Always release the connection, even when a statement fails.
            self.close_all(conn, cu)

    def create_table(self, sql):
        """Create a table by executing the given CREATE statement."""
        if sql is None or sql == '':
            print('the [{}] is empty or equal None!'.format(sql))
            return
        conn = self.get_conn(self.db_path)
        cu = self.get_cursor(conn)
        try:
            if SHOW_SQL:
                print('执行sql:[{}]'.format(sql))
            cu.execute(sql)
            conn.commit()
            print('创建数据库表[]成功!')
        finally:
            self.close_all(conn, cu)

    def drop_table(self, table):
        """Drop *table* if it exists.  Destroys its data -- use with care."""
        if table is None or table == '':
            # The original formatted an unassigned local here and crashed.
            print('the [{}] is empty or equal None!'.format(table))
            return
        # Identifiers cannot be bound as parameters; *table* must be trusted.
        statement = 'DROP TABLE IF EXISTS ' + table
        if SHOW_SQL:
            print('执行sql:[{}]'.format(statement))
        conn = self.get_conn(self.db_path)
        cu = self.get_cursor(conn)
        try:
            cu.execute(statement)
            conn.commit()
            print('删除数据库表[{}]成功!'.format(table))
        finally:
            self.close_all(conn, cu)

    def save(self, sql, data):
        """Insert rows: run *sql* once for every parameter tuple in *data*."""
        if sql is None or sql == '':
            print('the [{}] is empty or equal None!'.format(sql))
            return
        if data is not None:
            self._run_rows(sql, data)

    def fetchone(self, sql, data):
        """Run a query and report whether / how many rows matched.

        :param sql: SELECT statement; may contain one ``?`` placeholder.
        :param data: value bound to the placeholder.  When it is ``None`` or
            ``''`` the statement is executed without parameters.
        :returns: ``True``/``False`` (any row matched) when *data* is given,
            the number of rows when it is not, and ``None`` when *sql* is
            empty.
        """
        if sql is None or sql == '':
            return None
        conn = self.get_conn(self.db_path)
        cu = self.get_cursor(conn)
        try:
            if data is not None and data != '':
                if SHOW_SQL:
                    print('执行sql:[{}],参数:[{}]'.format(sql, data))
                cu.execute(sql, (data,))
                return len(cu.fetchall()) > 0
            print('执行sql:[{}]'.format(sql))
            cu.execute(sql)
            return len(cu.fetchall())
        finally:
            self.close_all(conn, cu)

    def update(self, sql, data):
        """Update rows: run *sql* once for every parameter tuple in *data*."""
        if sql is None or sql == '':
            print('the [{}] is empty or equal None!'.format(sql))
            return
        if data is not None:
            self._run_rows(sql, data)

    def delete(self, sql, data):
        """Delete rows: run *sql* once for every parameter tuple in *data*."""
        if sql is None or sql == '':
            print('the [{}] is empty or equal None!'.format(sql))
            return
        if data is not None:
            self._run_rows(sql, data)

    def get_structure(self):
        """Print the table names, each table's columns and sample queries."""
        conn = self.get_conn(self.db_path)
        cu = self.get_cursor(conn)
        try:
            cu.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
            tables = cu.fetchall()
            print(tables)
            # Column layout of every table.
            for (name,) in tables:
                cu.execute("PRAGMA table_info({})".format(name))
                print(cu.fetchall())
            # Contents of the temp table.
            cu.execute('SELECT * FROM temp')
            for row in cu.fetchall():
                try:
                    print(row)
                except UnicodeEncodeError:
                    # The console cannot display this row; skip it.
                    pass
            # Fuzzy-match demonstration against each word table.
            for table in ['cet_4', 'cet_6', 'temp']:
                cu.execute("SELECT en from [{}] where [{}].en like?".format(table, table), ('%appl%',))
                for row in cu.fetchall():
                    print(table, row)
        finally:
            self.close_all(conn, cu)


class Txt(object):
    """Extract and record candidate words from text files."""

    def __init__(self):
        """Nothing to initialise."""

    def deal_txt(self, path=PAPER_DIR):
        """Collect candidate words from every ``.txt`` file in *path*.

        Tokens are split on whitespace, trimmed of surrounding punctuation,
        reduced by one simple suffix rule and kept only when the result is
        purely alphabetic and 6-10 characters long.

        :param path: directory to scan (files are visited in sorted order).
        :returns: the distinct lower-cased words in first-seen order.
        """
        words = []
        counts = {}
        for entry in sorted(os.listdir(path)):
            file_name = os.path.join(path, entry)
            if not (os.path.isfile(file_name) and entry.lower().endswith('.txt')):
                continue
            # utf-8-sig drops a leading BOM, so the first word is no longer
            # thrown away just to get rid of it.
            with codecs.open(file_name, encoding="utf-8-sig") as handle:
                text = handle.read()
            seen_before = len(words)
            for token in text.split():
                word = token.strip(_STRIP_CHARS)
                if len(word) <= 4:
                    continue
                for suffix, replacement in _SUFFIX_RULES:
                    if word.endswith(suffix):
                        # Replace the suffix only, not every occurrence.
                        word = word[:-len(suffix)] + replacement
                        break
                if (len(word) <= 5 or len(word) > 10 or 'http:' in word
                        or '’s' in word or not word.isalpha()):
                    continue
                try:
                    print(word)
                except UnicodeEncodeError:
                    # Words the console cannot display are skipped, as before.
                    continue
                key = word.lower()
                if key in counts:
                    counts[key] += 1
                else:
                    counts[key] = 1
                    words.append(key)
            new_in_file = len(words) - seen_before
            print(file_name, "\n", "本篇文章符合要求单词个数是:%d" % new_in_file, len(words))
        print(len(words))
        print(sorted(counts.items(), key=lambda item: item[1], reverse=True))
        return words

    def save_txt(self):
        """Translate unknown words and store them in the DB and a txt file.

        Words already present (by prefix match) in cet_4, cet_6 or temp are
        skipped, as are results that look like names or untranslated text.

        :returns: list of ``(word, translation)`` tuples that were recorded.
        """
        database = sql()
        candidates = self.deal_txt()
        translator = url()
        recorded = []
        # Next id for the temp table: current row count plus one.
        next_id = database.fetchone('SELECT * FROM temp', '') + 1
        out_name = os.path.join(
            os.getcwd(), time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) + ".txt")
        with open(out_name, 'w', encoding='utf-8') as out:
            for word in candidates:
                known = False
                for table in ['cet_4', 'cet_6', 'temp']:
                    known = database.fetchone(
                        "SELECT en from [{}] where [{}].en like?".format(table, table),
                        word + '%')
                    if known:
                        print("%s数据库中存在%s!或者存在其变形" % (table, word))
                        break
                if known:
                    continue
                result = translator.get_fanyi(word)
                # NOTE(review): isalnum() is kept from the original filter;
                # presumably it targets untranslated output -- confirm.
                if (not result or result.isalnum() or '姓氏' in result
                        or '人名' in result or '女子名' in result or '男子名' in result):
                    print('不是单词,是短语或者其他')
                    continue
                try:
                    out.write(word + '\n')
                    recorded.append((word, result))
                    statement = "INSERT INTO temp VALUES (?,?,?,?,?)"
                    add_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    database.save(statement, [(next_id, word, result, add_time, 1)])
                    next_id += 1
                except (sqlite3.Error, OSError) as err:
                    # Best effort: report the failure and go on with the rest.
                    print('failed to record [{}]: {}'.format(word, err))
        return recorded


class url(object):
    """Fetch Chinese translations from the Youdao web endpoint."""

    def __init__(self):
        """Nothing to initialise."""

    def get_fanyi(self, content, timeout=10):
        """Translate *content* through Youdao.

        :param content: text to translate.
        :param timeout: seconds to wait for the HTTP response.
        :returns: the concatenated ``smartResult`` entries, or ``False`` when
            the response carries no ``smartResult`` field.
        :raises urllib.error.URLError: on network failures.
        """
        endpoint = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.baidu.com/link"
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
        }
        form = {
            'type': 'AUTO',
            'i': content,
            'doctype': 'json',
            'xmlVersion': '1.6',
            'keyfrom': 'fanyi.web',
            'ue': 'UTF-8',
            'typoResult': 'true',
        }
        payload = urllib.parse.urlencode(form).encode('utf-8')
        req = urllib.request.Request(endpoint, payload, header)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read().decode('utf-8')
        target = json.loads(html)
        if 'smartResult' not in target:
            return False
        result = ''.join(target['smartResult']['entries'])
        try:
            print('%s翻译的结果:%s' % (content, result))
        except UnicodeEncodeError:
            # The console cannot display the translation; still return it.
            pass
        return result


def main():
    """Scan the default paper directory and print the words found."""
    print('#' * 50)
    txt1 = Txt()
    txt1.deal_txt()


if __name__ == '__main__':
    main()
0 0
- 采集文献不认识单词上传扇贝网
- 碰到的不认识单词
- [python爬虫]模拟登陆扇贝单词
- 仿扇贝单词----自定义可拖拽控件
- 记英语单词 扇贝网
- 扇贝网面试经历
- 让扇贝单词书中的单词可以发音
- 【扇贝批量添加单词到词库】利用python调用扇贝API (oauth2)
- blender里不认识的单词(2)
- 今日扇贝英语学习难记单词继续努力
- 体验APP时的必答题—以扇贝单词为例
- Python--处理文献中单词,统计个数
- 平时背单词过程中总结的以前不认识的词
- arduino + esp8266+ ds18b20 采集温度上传贝壳网
- 通过git shell 向github上传文献
- AD采集数据&UDP上传
- 文献
- 文献
- java与c++内存泄露的问题
- Better ELK, 新浪实时日志分析服务进化
- “奔跑吧,牛客“---是男人就下100层
- 【转】蜗牛求职记之华为篇
- leetcode:Add Two Numbers
- 采集文献不认识单词上传扇贝网
- 我的人生我选择
- ACM_暑期计划
- 黑马day18 jquery高级特性&ajax的$.ajax()方法
- 运动估计综述
- Android ListView表单实现多选删除,高度随表单项多少变化
- 直接选择排序法
- hdu 3434 Sequence Adjustment
- 看淡 IT人生的云落云起