python汉译英-调用Google翻译
来源:互联网 发布:再向虎山行gotv源码ts 编辑:程序博客网 时间:2024/05/21 06:38
下文中很多接口已经失效。当前可用的接口是:http://translate.google.com/translate_a/t?client=t&hl=en&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&multires=1&prev=conf&psl=en&ptl=en&otf=1&it=sel.2016&ssel=0&tsel=0&prev=enter&oc=3&ssel=0&tsel=0&sc=1&text=hello%20world
也即方法四
一:使用正则表达式提取翻译结果
# -*- coding: utf-8 -*-
__author__ = "zlingh"
__date__ = "Date: 2014/01/28"

import re
import urllib,urllib2
# urllib2: The urllib2 module defines functions and classes which help in
# opening URLs (mostly HTTP) in a complex world -- basic and digest
# authentication, redirections, cookies and more.


def translate(text):
    """Translate ``text`` by scraping the Google Translate result page.

    Imitates a browser: sends the form data to the ``translate_t`` page and
    extracts the translated text from the returned HTML with a regular
    expression.

    Args:
        text: The sentence to translate (a byte string accepted by
            ``urllib.urlencode``).

    Returns:
        Whatever follows ``TRANSLATED_TEXT=`` in the page, up to (not
        including) the first ``;``.  Any quote characters around the value
        are left in place; the caller strips them.

    Raises:
        ValueError: If the page contains no ``TRANSLATED_TEXT=`` marker.
        urllib2.URLError: If the request itself fails.
    """
    # The language pair used here is 'zh-CN'|'en'; use "'en'|'zh-CN'"
    # (and hl='zh-CN') for the opposite direction.
    values = {'hl': 'en', 'ie': 'UTF-8', 'text': text,
              'langpair': "'zh-CN'|'en'"}
    url = 'http://translate.google.cn/translate_t'
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)

    # Pretend to be a browser.
    browser = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
    req.add_header('User-Agent', browser)

    # Send the request and read the whole result page; always release the
    # connection, even if reading fails.
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        response.close()

    # The translated text is whatever follows 'TRANSLATED_TEXT='.
    #   (?<=...) matches only if the current position is preceded by "..."
    #   .*?      matches as little as possible (non-greedy)
    p = re.compile(r"(?<=TRANSLATED_TEXT=).*?;")
    m = p.search(html)
    if m is None:
        # Without this check the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError("TRANSLATED_TEXT marker not found in the response page")
    return m.group(0).strip(';')


if __name__ == "__main__":
    # text_1 could also be read from a file instead of being hard-coded.
    text_1 = '北京天安门'+','+'故宫'+','+'长城'+','+'社会主义'
    print('%s' % text_1.decode('utf8'))
    text_2 = translate(text_1).strip("'")
    print('%s' % text_2.decode('utf8'))
二:利用xpath提取
在批量抓取网页内容时,我经常采用的做法是:1、得到目标内容在网页中的位置,即xpath路径;2、批量下载网页,然后利用xpath,取出每个网页中所需要的内容。
在这里,我们利用python模块lxml。
以谷歌翻译为例,我要批量抓取翻译内容,那么首先我要知道译文的xpath,代码如下:
- import urllib,urllib2
- import lxml
- import lxml.html as HTML
- import lxml.etree as etree
- #设置url参数
- lin = 'en'
- lout = 'zh-CN'
- text = 'my apple 123'
- values = {'hl':'zh-CN', 'ie':'UTF-8', 'text':text, 'sl':lin, 'tl':lout}
- url = 'http://translate.google.cn/translate_t'
- data = urllib.urlencode(values)
- req = urllib2.Request(url, data)
- req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)")
- response = urllib2.urlopen(req, timeout = 10)
- shtml = response.read()
- response.close()
- hdoc = HTML.fromstring(shtml)
- htree = etree.ElementTree(hdoc)
- #依次打印出hdoc每个元素的文本内容和xpath路径
- for t in hdoc.iter():
- print htree.getpath(t)
- print t.text_content()
- raw_input()
运行这段代码,发现译文“我的苹果123”的xpath为“/html/body/div[2]/div[2]/div[2]/div/div/div[2]/div”。
现在可以利用xpath取出译文内容。以下方法接受英文原文,然后调用google translate,返回中文译文。代码如下:
# -*- coding:utf-8 -*-
import urllib,urllib2
import lxml
import lxml.html as HTML
import lxml.etree as etree


def g_trans(str_text):
    """Translate ``str_text`` via Google Translate and return the result text.

    Sends the text to the ``translate_t`` page with source language ``en``
    and target language ``zh-CN``, parses the returned HTML with lxml and
    reads the translation from a fixed xpath.

    Args:
        str_text: The text to translate.

    Returns:
        The text content of the first element matched by the xpath.

    Raises:
        IndexError: If the xpath matches nothing in the returned page.
        urllib2.URLError: If the request itself fails.
    """
    lin = 'en'
    lout = 'zh-CN'
    values = {'hl':'zh-CN', 'ie':'UTF-8', 'text':str_text, 'sl':lin, 'tl':lout}
    url = 'http://translate.google.cn/translate_t'
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)")
    response = urllib2.urlopen(req, timeout = 10)
    try:
        htree = HTML.parse(response)
    finally:
        # Release the connection even if parsing raises.
        response.close()

    # Note: xpath() returns a list of matching elements.
    emts = htree.xpath('/html/body/div[2]/div[2]/div[2]/div/div/div[2]/div')
    if not emts:
        # Same exception type as the unchecked emts[0], but with a message
        # that says what actually went wrong.
        raise IndexError("translation element not found at the expected xpath")
    return emts[0].text_content()
三:解析div标签提取结果
import urllib,urllib2import timefrom sgmllib import SGMLParserclass URLLister(SGMLParser): def __init__(self, result): SGMLParser.__init__(self) self.result = result self.open = False def start_div(self, attrs): id = [v for k, v in attrs if k=='id'] if 'tts_button' in id: self.open = True def handle_data(self, text): if self.open: self.result.append(text) self.open = Falsedef Translate(text, f, t): MySentence = [] values = {'hl':'%s'%t,'ie':'UTF-8','text':text,'langpair':"%s|%s"%(f, t)} url = 'http://translate.google.cn/translate_t' data = urllib.urlencode(values) req = urllib2.Request(url, data) req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)") response = urllib2.urlopen(req) parser = URLLister(MySentence) parser.feed(response.read()) parser.close() return MySentence def TranlateForIgnorException(text): excpCnt = 0 while 1: try: arStr = Translate(langStr, "en", "zh-CN")[0] break except: excpCnt = excpCnt + 1 if excpCnt > 10: break time.sleep(2) return arStrif __name__ == "__main__": #ArStr = TranlateForIgnorException("This") b='你好' c='hello' a=Translate(c,'en','zh-CN') print a[0].decode('utf8')
另外:如果在上面抓取到的网页中不容易找到译文,也可以改为从 http://translate.google.cn/?hl=en#zh-CN/en/饼干 这个网页中提取。
四:通过http://translate.google.com/translate_a/t?client=z&hl=en&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&text=hello+world(管用)
# -*- coding: utf-8 -*-__author__ = "zlingh" __date__ = "Date: 2014/01/28" import reimport urllib,urllib2def translate(text): '''模拟浏览器的行为,向Google Translate的主页发送数据,然后抓取翻译结果 ''' #text 输入要翻译的英文句子 text_1=text values={'client':'z','tl': 'en','sl': 'zh-CN','ie':'UTF-8','oe': 'UTF-8','text':text_1} url='http://translate.google.com/translate_a/t' data = urllib.urlencode(values) req = urllib2.Request(url,data) #模拟一个浏览器 browser='Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)' req.add_header('User-Agent',browser) #向谷歌翻译发送请求 response = urllib2.urlopen(req) #读取返回页面 html=response.read() print html dt = eval(html) return dt["sentences"][0]["trans"]if __name__ == "__main__": #text_1 从文件的读取 #text_1=open('c:\\text.txt','r').read() #text_1='故宫'+','+'长城'+','+'社会主义' text_1='速度' text_2=translate(text_1) print text_2.decode('utf8')
五:翻译文件
# -*- coding: utf-8 -*-__author__ = "zhangping" __date__ = "Date: 2014/05/12" import reimport urllib,urllib2import jsonimport functoolsdef translate(text, src, dest): '''模拟浏览器的行为,向Google Translate的主页发送数据,然后抓取翻译结果 ''' #text 输入要翻译的英文句子 text_1=text #values={'client':'z','sl':'en','tl': 'zh-CN','ie':'UTF-8','oe': 'UTF-8','text':text_1} values={'client':'z','sl': src,'tl': dest,'ie':'UTF-8','oe': 'UTF-8','text':text_1} url='http://translate.google.com/translate_a/t' data = urllib.urlencode(values) req = urllib2.Request(url,data) #模拟一个浏览器 browser='Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)' req.add_header('User-Agent',browser) #向谷歌翻译发送请求 response = urllib2.urlopen(req) #读取返回页面 html=response.read() #print html dt = json.loads(html) rel=[l["trans"].strip('\n') for l in dt["sentences"]] return reldef f2w(words, f2): for w in words: w = w.strip(' ') f2.write(w.encode('utf8')+'\n') if __name__ == "__main__": en2cn = functools.partial(translate, src='en',dest='zh-cn') cn2en = functools.partial(translate, src='zh-cn',dest='en') f1 = open('nr.ch','r') f2 = open('trans','w') h = 0 words_cn ='' for line in f1: tmp = line words_cn+=tmp h+=1 if(h%300==0): print h words_en=cn2en(words_cn) f2w(words_en,f2) words_cn='' words_en=cn2en(words_cn) f2w(words_en,f2) f1.close() f2.close()
下面是网上一个很牛的项目,用起来很方便,但是把中文翻译成英文时好像有点问题,我没有调试出来:
Goslate 免费谷歌翻译
http://zhuoqiang.me/goslate-free-google-translate-api.html
0 0
- python汉译英-调用Google翻译
- 小玩意系列:Python调用Google翻译
- python 调用Google Translate API进行翻译
- python 调用Google Translate API 翻译
- 调用google全文翻译
- 调用google翻译
- 调用GOOGLE翻译
- java 调用google翻译
- python3调用Google翻译
- google 翻译python版本
- python google翻译
- python3.2调用google翻译
- 调用Google翻译 语音接口
- 调用Google翻译 语音接口
- Python 调用 百度翻译
- python调用百度翻译
- [Python]Google翻译小程序
- MAC - 使用 Alfred 调用 Google 翻译
- 解决Sublime Text 3中文显示乱码问题
- do {...} while (0) 在宏定义中的作用
- NBUT 1538 Submartix
- 在潛移默化里··········
- MapReduce的Join
- python汉译英-调用Google翻译
- iOS Xcode工程目录的 folder 和 group的区别(蓝色和黄色文件夹的区别)
- nginx FastCGI模块(FastCGI)配置
- nginx反向代理服务器端口问题
- Xcode5的使用技巧和快捷键
- 嗖,2013-我早已出发
- google中有意思的事情
- jquery实现input输入框实时输入触发事件
- Ubuntu10.04下安装ffmpeg(续)