采集文献中不认识的单词并上传扇贝网

来源：互联网　发布：免费网络摄像头直播　编辑：程序博客网　时间：2024/04/30 03:07
程序分为三个功能块

1、sql() 数据库功能块：基于从网上下载的源代码，自己又加了些东西；其源码出处见原文链接（“点击打开链接”，此处链接已丢失）。

2、Txt() 文本处理模块：从文本中搜寻单词并与本地数据库对比；对不在本地数据库中的单词，去有道翻译获取释义，去掉人名、地名或者短语后写入数据库，并同时写入 txt 文件（上传用）。

3、url()翻译模块。

另外建立了一个数据库：把从网上下载的单词 txt 文件导入其中，库中的表包括四级（cet_4）和六级（cet_6）词汇。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Collect unfamiliar English words from text files.

Three building blocks:

* ``sql`` -- thin helper around the SQLite word database (``test.db``).
* ``Txt`` -- extracts candidate words from ``.txt`` files, filters out the
  ones already present in the database and records the rest.
* ``url`` -- looks a word up through the Youdao translation endpoint.
"""
import os
import sys
import sqlite3
import time
import codecs
import urllib.request
import urllib.parse
import json


class sql(object):
    '''Helper for the SQLite word database.

    test.db holds four tables: cet_4, conquer_word, temp and cet_6.

                cet_4      conquer_word   temp       cet_6
    id          integer    integer        integer    integer
    en          text       text           text       text
    cn          text       real           text       text
    add_time    text       text           text       text
    conquer     INT        int            int        int
    classify    ..         ..             ..         text
    '''

    def __init__(self, db_path=None):
        '''Set the module-level configuration used by every method.

        Args:
            db_path: optional path of the database file.  When omitted the
                file ``test.db`` in the current working directory is used.
        '''
        # Path of the database file.
        global DB_FILE_PATH
        if db_path is None:
            # os.path.join instead of a hard-coded '\\' so the path is also
            # correct on non-Windows systems.
            db_path = os.path.join(os.getcwd(), 'test.db')
        DB_FILE_PATH = db_path
        # Name of the default table.
        global TABLE_NAME
        TABLE_NAME = 'cet_4'
        # Whether executed statements are echoed to stdout.
        global SHOW_SQL
        SHOW_SQL = True
        print('show_sql : {}'.format(SHOW_SQL))

    def get_conn(self, path):
        '''Return a connection for the database file at ``path``.

        ``sqlite3.connect`` is attempted first.  If ``path`` then refers to a
        regular file, that connection is returned; otherwise (for example an
        empty path) the stray connection is closed and a connection to an
        in-memory database is returned, so callers never receive ``None``.
        '''
        conn = sqlite3.connect(path)
        if os.path.isfile(path):
            return conn
        conn.close()
        return sqlite3.connect(':memory:')

    def get_cursor(self, conn):
        '''Return a cursor of ``conn``.

        When ``conn`` is ``None`` a cursor of a fresh in-memory database is
        returned instead.
        '''
        if conn is not None:
            return conn.cursor()
        return self.get_conn('').cursor()

    def create_table(self, sql):
        '''Execute the ``CREATE TABLE`` statement ``sql`` and commit it.'''
        if not sql:
            print('the [{}] is empty or equal None!'.format(sql))
            return
        conn = self.get_conn(DB_FILE_PATH)
        cu = None
        try:
            cu = self.get_cursor(conn)
            if SHOW_SQL:
                print('执行sql:[{}]'.format(sql))
            cu.execute(sql)
            conn.commit()
            print('创建数据库表[]成功!')
        finally:
            # Release the connection even when the statement fails.
            self.close_all(conn, cu)

    def drop_table(self, table):
        '''Drop ``table`` if it exists.  All rows of the table are lost.'''
        if not table:
            # The statement has not been built in this branch, so report the
            # offending table name itself.
            print('the [{}] is empty or equal None!'.format(table))
            return
        sql = 'DROP TABLE IF EXISTS ' + table
        if SHOW_SQL:
            print('执行sql:[{}]'.format(sql))
        conn = self.get_conn(DB_FILE_PATH)
        cu = None
        try:
            cu = self.get_cursor(conn)
            cu.execute(sql)
            conn.commit()
            print('删除数据库表[{}]成功!'.format(table))
        finally:
            self.close_all(conn, cu)

    def close_all(self, conn, cu):
        '''Close the cursor and then the connection (either may be None).'''
        try:
            if cu is not None:
                cu.close()
        finally:
            if conn is not None:
                conn.close()

    def _execute_each(self, sql, data):
        '''Run ``sql`` once per parameter tuple in ``data``, committing each.'''
        if not sql:
            print('the [{}] is empty or equal None!'.format(sql))
            return
        if data is None:
            return
        conn = self.get_conn(DB_FILE_PATH)
        cu = None
        try:
            cu = self.get_cursor(conn)
            for d in data:
                if SHOW_SQL:
                    print('执行sql:[{}],参数:[{}]'.format(sql, d))
                cu.execute(sql, d)
                conn.commit()
        finally:
            self.close_all(conn, cu)

    def save(self, sql, data):
        '''Insert rows: run ``sql`` for every parameter tuple in ``data``.'''
        self._execute_each(sql, data)

    def fetchone(self, sql, data):
        '''Run a query and summarise its result.

        Args:
            sql: the ``SELECT`` statement.
            data: a single bind parameter, or ``None``/``''`` for a statement
                without parameters.

        Returns:
            * with ``data``: ``True`` if at least one row matched, else
              ``False``;
            * without ``data``: the number of rows returned by the query;
            * ``None`` when ``sql`` itself is empty.
        '''
        if not sql:
            return None
        conn = self.get_conn(DB_FILE_PATH)
        cu = None
        try:
            cu = self.get_cursor(conn)
            if data is not None and data != '':
                if SHOW_SQL:
                    print('执行sql:[{}],参数:[{}]'.format(sql, data))
                cu.execute(sql, (data,))
                return len(cu.fetchall()) > 0
            print('执行sql:[{}]'.format(sql))
            cu.execute(sql)
            return len(cu.fetchall())
        finally:
            # Every query opens its own connection, so it must be released
            # here or one connection leaks per looked-up word.
            self.close_all(conn, cu)

    def update(self, sql, data):
        '''Update rows: run ``sql`` for every parameter tuple in ``data``.'''
        self._execute_each(sql, data)

    def delete(self, sql, data):
        '''Delete rows: run ``sql`` for every parameter tuple in ``data``.'''
        self._execute_each(sql, data)

    def get_structure(self):
        '''Print the table names, the rows of ``temp`` and a sample query.'''
        conn = self.get_conn(DB_FILE_PATH)
        cu = None
        try:
            cu = self.get_cursor(conn)
            cu.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
            print(cu.fetchall())
            # Walk over the structure of every table; the result is queried
            # but not printed.
            cu.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
            for i in cu.fetchall():
                cu.execute("PRAGMA table_info({})".format(i[0]))
            # Dump the rows of the temp table.
            cu.execute('SELECT * FROM temp')
            for j in cu.fetchall():
                try:
                    print(j)
                except UnicodeEncodeError:
                    # The console cannot display this row; skip it.
                    pass
            # Example of a fuzzy (LIKE) query against every word table.
            for table in ['cet_4', 'cet_6', 'temp']:
                cu.execute("SELECT en from [{}] where [{}].en like?".format(table, table), ('%appl%',))
                for l in cu.fetchall():
                    print(table, l)
        finally:
            self.close_all(conn, cu)


class Txt(object):
    """Extract candidate words from text files and record the new ones."""

    # Folder scanned when no explicit path is given.
    DEFAULT_PATH = "C:\\Users\\John\\Desktop\\work\\paper"

    # Every character that appears in the original list of symbols to remove.
    _STRIP_CHARS = ' !%&*();[]{}\\|/?:"“”：；,，.。`·~-_+=—'

    # (suffix, replacement) pairs, tried in order; the first match wins.
    _SUFFIX_RULES = (
        ('ing', ''),
        ('ied', 'y'),
        ('ies', 'y'),
        ('tions', 'tion'),
        ('ed', ''),
        ('\'s', ''),
        ('s', ''),
    )

    def __init__(self):
        '''Nothing to initialise.'''

    @classmethod
    def _normalise(cls, token):
        '''Return the cleaned-up form of ``token`` or None if it is rejected.

        The original letter case is kept; the caller lower-cases the word.
        '''
        word = token.strip(cls._STRIP_CHARS)
        if len(word) <= 4:
            return None
        for suffix, replacement in cls._SUFFIX_RULES:
            if word.endswith(suffix):
                # Replace the trailing suffix only; str.replace would also
                # remove every other occurrence inside the word.
                word = word[:-len(suffix)] + replacement
                break
        if (len(word) <= 5 or len(word) > 10 or 'http:' in word
                or '’s' in word or not word.isalpha()):
            return None
        return word

    def deal_txt(self, path=None):
        '''Collect candidate words from every ``.txt`` file in ``path``.

        Args:
            path: folder to scan; defaults to ``DEFAULT_PATH``.

        Returns:
            The lower-cased words in order of first appearance, without
            duplicates.  Per-file statistics and the overall frequency table
            are printed.
        '''
        if path is None:
            path = self.DEFAULT_PATH
        temp_dic = []
        seen = set()        # same content as temp_dic, for O(1) membership
        dict_account = {}   # word -> number of occurrences
        for f in sorted(os.listdir(path)):
            file_name = os.path.join(path, f)
            if not (os.path.isfile(file_name) and f.endswith('.txt')):
                continue
            # utf-8-sig drops a leading byte order mark, so the first word of
            # the file no longer has to be thrown away.
            with open(file_name, encoding="utf-8-sig") as ff:
                s = ff.read()
            before = len(temp_dic)
            # Split on any whitespace so that words at line ends are kept.
            for token in s.split():
                i = self._normalise(token)
                if i is None:
                    continue
                try:
                    print(i)
                except UnicodeEncodeError:
                    # Words the console cannot display are skipped.
                    continue
                word = i.lower()
                if word not in seen:
                    seen.add(word)
                    temp_dic.append(word)
                dict_account[word] = dict_account.get(word, 0) + 1
            # Number of new words contributed by this file.
            num = len(temp_dic) - before
            print(file_name, "\n", "本篇文章符合要求单词个数是:%d" % num, len(temp_dic))
        print(len(temp_dic))
        print(sorted(dict_account.items(), key=lambda a: a[1], reverse=True))
        return temp_dic

    def save_txt(self, path=None):
        '''Translate the unknown words and store them.

        Words found in one of the tables cet_4, cet_6 or temp (prefix match)
        are skipped.  The remaining words are translated; results that look
        like names or phrases are dropped.  Accepted words are written to a
        time-stamped text file in the working directory and inserted into the
        ``temp`` table.

        Args:
            path: folder handed to ``deal_txt``.

        Returns:
            A list of ``(word, translation)`` tuples.
        '''
        sql1 = sql()
        temp_dic = self.deal_txt(path)
        translate = url()
        temp_word_list = []
        # The next id is derived from the number of rows already in temp.
        j = sql1.fetchone('SELECT * FROM temp', '') + 1
        out_name = os.path.join(
            os.getcwd(),
            time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) + ".txt")
        with open(out_name, 'w') as f:
            for i in temp_dic:
                found_in = None
                for table in ['cet_4', 'cet_6', 'temp']:
                    query = "SELECT en from [{}] where [{}].en like?".format(table, table)
                    if sql1.fetchone(query, i + '%'):
                        found_in = table
                        break
                if found_in is not None:
                    print("%s数据库中存在%s！或者存在其变形" % (found_in, i))
                    continue
                result = translate.get_fanyi(i)
                # ``not result`` covers both False (no dictionary entry) and
                # an empty translation.
                if (not result or result.isalnum() or '姓氏' in result
                        or '人名' in result or '女子名' in result
                        or '男子名' in result):
                    print('不是单词，是短语或者其他')
                    continue
                try:
                    f.write(i + '\n')
                    temp_word_list.append((i, result))
                    sq = "INSERT INTO temp  VALUES (?,?,?,?,?)"
                    add_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    data = [(j, i, result, add_time, 1)]
                    sql1.save(sq, data)
                    j += 1
                except (sqlite3.Error, OSError, UnicodeError) as exc:
                    # Keep going with the next word, but say what went wrong.
                    print('failed to record [{}]: {}'.format(i, exc))
        return temp_word_list


class url(object):
    """Fetch the Chinese translation of a word from the Youdao web service."""

    def __init__(self):
        '''Nothing to initialise.'''

    def get_fanyi(self, content, timeout=10):
        '''Translate ``content``.

        Args:
            content: the text to translate.
            timeout: seconds to wait for the HTTP request.

        Returns:
            The joined ``smartResult`` entries of the JSON reply, or ``False``
            when the reply has no ``smartResult`` section.

        Raises:
            urllib.error.URLError: when the request fails.
        '''
        endpoint = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.baidu.com/link"
        header = {}
        header['User-Agent'] = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
        data = {}
        data['type'] = 'AUTO'
        data['i'] = content
        data['doctype'] = 'json'
        data['xmlVersion'] = '1.6'
        data['keyfrom'] = 'fanyi.web'
        data['ue'] = 'UTF-8'
        data['typoResult'] = 'true'
        data = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(endpoint, data, header)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read().decode('utf-8')
        target = json.loads(html)
        if 'smartResult' not in target:
            return False
        result = ''.join(target['smartResult']['entries'])
        try:
            print('%s翻译的结果：%s' % (content, result))
        except UnicodeEncodeError:
            # The console cannot display the translation; the value is still
            # returned to the caller.
            pass
        return result


def main():
    """Scan the default folder and print the collected words."""
    print('#' * 50)
    txt1 = Txt()
    txt1.deal_txt()


if __name__ == '__main__':
    main()
0 0