NLP jieba
来源:互联网 发布:mac下安装启动盘 编辑:程序博客网 时间:2024/06/08 16:23
In [1]:
# encoding=utf-8
# Demo of jieba's three segmentation modes: full, accurate (default),
# and search-engine mode.
import jieba

# Full mode: emit every dictionary word found (cuts may overlap).
seg_list = jieba.cut("我在学习自然语言处理", cut_all=True)
# jieba.cut returns a generator; printing it only shows its repr,
# the actual tokens are consumed by the join below.
print(seg_list)
print("Full Mode: " + "/ ".join(seg_list))  # full mode

# Accurate mode (the default): the single best segmentation.
seg_list = jieba.cut("我在学习自然语言处理", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

seg_list = jieba.cut("他毕业于上海交通大学,在百度深度学习研究院进行研究")  # accurate mode is the default
print(", ".join(seg_list))

# Search-engine mode: additionally splits long words to improve recall.
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在哈佛大学深造")
print(", ".join(seg_list))
In [2]:
# lcut / lcut_for_search return plain lists (unlike cut, which returns a
# generator), so the result can be printed and joined repeatedly.
result_lcut = jieba.lcut("小明硕士毕业于中国科学院计算所,后在哈佛大学深造")
print(result_lcut)
print(" ".join(result_lcut))
print(" ".join(jieba.lcut_for_search("小明硕士毕业于中国科学院计算所,后在哈佛大学深造")))
In [3]:
# Segment with the HMM disabled so only dictionary words are used;
# before tuning, "中将" is (incorrectly) kept as a single token here.
tokens = jieba.cut('如果放到旧字典中将出错。', HMM=False)
print('/'.join(tokens))
In [4]:
# Tell jieba that "中" and "将" should be split apart when adjacent;
# tune=True adjusts the internal word frequencies accordingly.
split_pair = ('中', '将')
jieba.suggest_freq(split_pair, True)
Out[4]:
In [5]:
# Same sentence as before: after suggest_freq, "中/将" is now split correctly.
segmented = '/'.join(jieba.cut('如果放到旧字典中将出错。', HMM=False))
print(segmented)
In [6]:
import jieba.analyse as analyse

# TF-IDF keyword extraction: top 20 keywords, no weights, no POS filter.
# Context manager closes the file handle promptly (original leaked it).
with open('NBA.txt') as f:
    lines = f.read()
print(" ".join(analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())))
In [7]:
# Same TF-IDF extraction on a larger corpus (Journey to the West).
# Close the file via a context manager instead of leaking the handle.
with open(u'西游记.txt') as f:
    lines = f.read()
print(" ".join(analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())))
In [8]:
import jieba.analyse as analyse

# TextRank keyword extraction; allowPOS restricts candidate parts of speech
# (ns=place, n=noun, vn=verbal noun, v=verb).
with open('NBA.txt') as f:
    lines = f.read()
print(" ".join(analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))))
print("---------------------我是分割线----------------")
# Nouns only: compare how the keyword list changes.
print(" ".join(analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n'))))
In [9]:
# TextRank on Journey to the West, restricted to noun/verb POS tags.
# Context manager avoids leaking the file handle.
with open(u'西游记.txt') as f:
    lines = f.read()
print(" ".join(analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))))
In [10]:
import jieba.posseg as pseg

# Part-of-speech tagging: posseg.cut yields (token, POS-flag) pairs.
for token, pos in pseg.cut("我爱自然语言处理"):
    print('%s %s' % (token, pos))
In [11]:
import sys
import time

import jieba

# Benchmark parallel vs. sequential segmentation on the same corpus.
# Read the text once (the original re-opened and re-read the same file
# for each run and never closed either handle).
with open(u'西游记.txt', "r") as f:
    content = f.read()

# Parallel run (spawns worker processes; POSIX only).
jieba.enable_parallel()
t1 = time.time()
words = "/ ".join(jieba.cut(content))
tm_cost = time.time() - t1
print('并行分词速度为 %s bytes/second' % (len(content)/tm_cost))

# Sequential run for comparison.
jieba.disable_parallel()
t1 = time.time()
words = "/ ".join(jieba.cut(content))
tm_cost = time.time() - t1
print('非并行分词速度为 %s bytes/second' % (len(content)/tm_cost))
In [12]:
# jieba.tokenize yields (word, start, end) tuples giving each token's
# character offsets in the input string.
print("这是默认模式的tokenize")
result = jieba.tokenize(u'自然语言处理非常有用')
for tk in result:
    print("%s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

print("\n-----------我是神奇的分割线------------\n")

# Search mode additionally emits sub-word tokens of long words.
print("这是搜索模式的tokenize")
result = jieba.tokenize(u'自然语言处理非常有用', mode='search')
for tk in result:
    print("%s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
In [16]:
# -*- coding: UTF-8 -*-from __future__ import unicode_literalsimport sys,ossys.path.append("../")from whoosh.index import create_in,open_dirfrom whoosh.fields import *from whoosh.qparser import QueryParseranalyzer = jieba.analyse.ChineseAnalyzer()schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) if not os.path.exists("tmp"): os.mkdir("tmp")ix = create_in("tmp", schema) # for create new index#ix = open_dir("tmp") # for read onlywriter = ix.writer()writer.add_document( title="document1", path="/a", content="This is the first document we’ve added!")writer.add_document( title="document2", path="/b", content="The second one 你 中文测试中文 is even more interesting! 吃水果")writer.add_document( title="document3", path="/c", content="买水果然后来世博园。")writer.add_document( title="document4", path="/c", content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")writer.add_document( title="document4", path="/c", content="咱俩交换一下吧。")writer.commit()searcher = ix.searcher()parser = QueryParser("content", schema=ix.schema)for keyword in ("水果世博园","你","first","中文","交换机","交换"): print(keyword+"的结果为如下:") q = parser.parse(keyword) results = searcher.search(q) for hit in results: print(hit.highlights("content")) print("\n--------------我是神奇的分割线--------------\n")for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"): print(t.text)
In [ ]:
阅读全文
1 0
- NLP jieba
- nlp技能,jieba分词
- 【python 走进NLP】 NLP 使用jieba分词处理文本
- NLP中jieba分词的用法(一)
- jieba
- jieba
- jieba
- jieba
- NLP
- nlp
- NLP
- NLP
- NLP
- NLP
- NLP
- NLP
- jieba分词
- jieba分词
- shell 完成快速排序
- UEditor后端配置项说明
- 题目169-素数
- 配置依赖反射设置注入
- 四边形优化DP CCF 201612-04权压缩编码
- NLP jieba
- 单链表归并排序java
- Python + Splinter 实现自动化登录第一步中遇到的问题
- SQL语句优化技巧
- UEditor完整配置项
- 百家讲坛 王立群读史记之汉武帝
- 拓展Log4j基本尝试.md
- git 命令小记
- JavaScript中九九乘法表制作