A Few Small Problems


1. How to set up a Python file template in PyCharm
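A minimal sketch of such a template, assuming the usual location Settings → Editor → File and Code Templates → Python Script; the ${...} placeholders are PyCharm's built-in template variables, and the header style mirrors the one on the scripts in items 10-12 below:

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: ${USER}
@file: ${NAME}
@time: ${DATE} ${TIME}
"""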


2. PyCharm: Chinese comments raise SyntaxError: Non-ASCII character
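Python 2 assumes source files are ASCII, so any non-ASCII comment triggers this error; per PEP 263, declare the file encoding at the top of the file:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# 中文注释 no longer raises SyntaxError once the encoding is declared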


3. Fixing garbled text in the Navicat interface on Ubuntu


4. Navicat on Ubuntu fails to import a CSV file: most often the field delimiter has defaulted to Tab ("定位" in the Chinese UI); change it to comma (or try the other options)


5. A guide to using Gensim Word2vec
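A minimal sketch of the basic workflow (the toy sentences here are made up; item 12 below has the full training module used in this project):

import gensim

# a corpus is any iterable of tokenized sentences
sentences = [['hello', 'there'], ['how', 'are', 'you']]
model = gensim.models.Word2Vec(sentences, size=200, min_count=1, workers=4)
model.save('./mymodel')

model = gensim.models.Word2Vec.load('./mymodel')
print(model.wv['hello'])                    # the 200-dimensional vector for 'hello'
print(model.wv.similarity('hello', 'you'))  # cosine similarity between two words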


6. An introduction to spaCy, a natural-language-processing toolkit
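A minimal sketch of the two spaCy features used below, sentence segmentation (item 11) and part-of-speech tagging (item 12); the standard small English model is assumed here, while the scripts below load a shortcut link named 'en_sm':

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Hello there. My name is Sun.')
for sent in doc.sents:              # sentence segmentation
    print(sent.text)
for token in doc:                   # tokenization with POS tags
    print(token.text, token.pos_)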


7. AttributeError: 'Word2Vec' object has no attribute 'syn0'
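This appears after upgrading to gensim 1.0+, which moved the vectors onto a KeyedVectors object; access them through model.wv instead:

model = gensim.models.Word2Vec.load('./mymodel')
vectors = model.wv.syn0     # instead of model.syn0
# in gensim 4.x the attribute is model.wv.vectors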


8. SQLAlchemy introduction (mapping between MySQL and SQLAlchemy data types)
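A minimal sketch of the common pairings, using a hypothetical table; each Column type emits the MySQL type shown in the trailing comment:

from sqlalchemy import create_engine, Column, Integer, String, Text, Float, DateTime
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Demo(Base):                             # hypothetical table, for illustration only
    __tablename__ = 'demo'
    id = Column(Integer, primary_key=True)    # INT
    name = Column(String(255))                # VARCHAR(255)
    body = Column(Text)                       # TEXT
    score = Column(Float)                     # FLOAT
    created = Column(DateTime)                # DATETIME

engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
Base.metadata.create_all(engine)              # emits the corresponding CREATE TABLE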


9. Keras: texts_to_sequences_generator(texts)

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string
tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts(text_list)

The error is:

File "/home/sunxiangguo/PycharmProjects/personality/cnn.py", line 85, in <module> tokenizer.fit_on_texts(text_list)File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 119, in fit_on_texts self.split)File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 38, in text_to_word_sequence text = text.translate(maketrans(filters, split * len(filters)))TypeError: character mapping must return integer, None or unicode

Fix: in Python 2, the translation table Keras builds with maketrans works for byte strings but not for unicode strings, which is what triggers the TypeError; encoding the sentences to ASCII byte strings first avoids it:

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string
tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts([s.encode('ascii') for s in text_list])
# tokenizer.fit_on_texts(text_list)

10. Write a once-and-for-all data class that handles both database reading and train/test splitting

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: get_data
@time: 17-7-11 1:55 PM
"""
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import json


class Data(object):
    def __init__(self, big_five='cEXT'):
        # never changes:
        self.engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
        self.sample_y5 = self._get_sample_y5()
        self.sample_x = self._get_sample_x()

        # user-tunable
        self.big_five = big_five
        self.train_size = 0.9

        # updated automatically when the settings above change
        self.sample_y = self._get_sample_y()  # changes only with big_five
        self.x_train = None   # changes with big_five and train_size
        self.x_test = None    # changes with big_five and train_size
        self.y_train = None   # changes with big_five and train_size
        self.y_test = None    # changes with big_five and train_size
        self.update_train_test()

    def details(self):
        return {"sample_x": self.sample_x.shape,
                "sample_y5": self.sample_y5.shape,
                "big_five": self.big_five,
                "train_size": self.train_size,
                "sample_y": self.sample_y.shape,
                "x_train": self.x_train.shape,
                "x_test": self.x_test.shape,
                "y_train": self.y_train.shape,
                "y_test": self.y_test.shape}

    def _get_sample_x(self):
        df_all = pd.read_sql_table('table_3', self.engine, columns=['line_text'])  # read essays
        all_text = df_all['line_text']
        sample_x = []
        for text in all_text:
            # all line_text of one essay
            cut_sentence_list = json.loads(text)  # type: list (from json to list)
            sample_x.append(cut_sentence_list)
        return np.array(sample_x).reshape((-1, 1))  # shape (2467, 1)

    def _get_sample_y5(self):
        return pd.read_sql_table('essays', self.engine,
                                 columns=['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'])  # read essays

    def _get_sample_y(self):
        return self.sample_y5[self.big_five].values.reshape((-1, 1))  # shape (2467, 1)

    def set_big_five(self, big_five):
        """
        :param big_five: one of 'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'
        """
        self.big_five = big_five
        self.sample_y = self._get_sample_y()
        self.update_train_test()

    def set_train_size(self, train_size):
        self.train_size = train_size
        self.update_train_test()

    def update_train_test(self):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.sample_x, self.sample_y, random_state=1, train_size=self.train_size)


if __name__ == '__main__':
    data = Data()
    print(data.details())
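Downstream code can then pull all of its splits from one place, for example:

data = Data()
data.set_big_five('cNEU')    # switch the target trait; sample_y and the splits update automatically
data.set_train_size(0.8)
print(data.details())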

11. Write a once-and-for-all sentence-splitting module

#!/usr/bin/python
# -*- coding:utf8 -*-
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: RCNN
@time: 17-7-13 11:46 AM
"""
from sqlalchemy import create_engine    # MySQL ORM interface, nicer than MySQLdb
import pandas as pd
import spacy    # an NLP library like NLTK, but more industrial-strength
import json

nlp = spacy.load('en_sm')   # load the spaCy model once, not once per text


def cut_sentences(df):
    all_text_name = df["#AUTHID"]  # pandas.Series: every text name (the "#AUTHID" column in essays)
    all_text = df["TEXT"]          # pandas.Series: every text (the "TEXT" column in essays)
    for i in all_text_name.index:   # iterate over the chunk's own index
        print("start to deal with text ", i, " ...")
        text = all_text[i]            # str: one text in all_text
        text_name = all_text_name[i]  # str: its name in all_text_name
        test_doc = nlp(text.decode())
        cut_sentence = []
        for sent in test_doc.sents:     # each sentence in the text
            # sent is a spacy.tokens.span.Span, not a string,
            # so we use Span.text to get its unicode form
            cut_sentence.append(sent.text)
        cut_sentence_json = json.dumps(cut_sentence)
        line_number = len(cut_sentence)
        input_data_dic = {'text_name': text_name,
                          'line_number': line_number,
                          'line_text': cut_sentence_json}
        input_data = pd.DataFrame(input_data_dic, index=[i],
                                  columns=['text_name', 'line_number', 'line_text'])
        # DataFrame.index would be written to the table by default;
        # we don't want that, so we pass index=False (the default is True)
        input_data.to_sql('table_3', engine, if_exists='append', index=False, chunksize=100)
        print("text ", i, " finished")


if __name__ == '__main__':
    engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
    df = pd.read_sql_table('essays', engine, chunksize=5)  # read essays five rows at a time
    for df_iter in df:
        cut_sentences(df_iter)

12. Write a word2vec training module

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: word2vect
@time: 17-7-10 5:00 PM
"""
from sqlalchemy import create_engine    # MySQL ORM interface, nicer than MySQLdb
import pandas as pd
import spacy    # an NLP library like NLTK, but more industrial-strength
import json
import gensim
import datetime, time

"""
We use gensim.models.Word2Vec(sentences, size=200, min_count=0, workers=4)
to train our word-vector model without GPUs.

Parameters:
  min_count=0   vocabulary pruning threshold (0 keeps every word)
  size=200      number of units in the NN layer
  workers=4     degree of parallelism
  alpha=0.025

start_time = 2017-07-10 19:30
end_time   = 2017-07-11 05:32:13.441757
total time: about 10 hours

Ubuntu 16.04 LTS 64-bit, Python 2.7, PyCharm
memory: 7.7GB, Intel Core i7-4790 CPU @ 3.60GHz x 8
"""

start_time = time.strftime("%Y-%m-%d %H:%M:%S")
with open("./log.txt", 'a') as f:
    f.write(start_time)

nlp = spacy.load('en_sm')   # load the spaCy model once, not once per text
STOP_WORD_POS = ["PUNCT", "SPACE", "DET", "ADP"]   # punctuation, whitespace, determiners, prepositions


class MySentences(object):
    def __init__(self, df_generator):
        self.df_generator = df_generator

    def __iter__(self):
        all_text = self.df_generator['line_text']
        count = 0
        for text in all_text:
            count += 1
            print(str(datetime.datetime.now()) + "..." + str(count))
            # all sentences of one essay
            cut_sentence_list = json.loads(text)    # type: list (from json to list)
            # step 1: tokenize every sentence and drop the stop POS tags, e.g.
            #   "hello , there."    -> ['hello', 'there']
            #   "I'm fine, thanks"  -> ["I'm", "fine", "thanks"]
            # yielding one token list per sentence
            for sentence in cut_sentence_list:
                sent = []
                text_doc = nlp(sentence.decode())
                for token in text_doc:
                    if token.pos_ not in STOP_WORD_POS:
                        sent.append(token.text)     # token.text is unicode
                yield sent


engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
df_all = pd.read_sql_table('table_3', engine)  # read the cut sentences
sentences = MySentences(df_all)
model = gensim.models.Word2Vec(sentences, size=200, min_count=0, workers=4)
path = "./mymodel"
model.save(path)
# To load the model later:
# new_model = gensim.models.Word2Vec.load(path)
# print(new_model.similarity("now", "here"))

13. Error when checking model target: expected activation_2 to have shape (None, 10) but got array with shape (3, 1)

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

X_train = np.array([[1, 2], [6, 5], [8, 2]])
y_train = np.array([2, 3, 7])
input_dim = X_train.shape[1]

model = Sequential()
model.add(Dense(output_dim=64, input_dim=input_dim))
model.add(Activation("relu"))
model.add(Dense(output_dim=10))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)

Fix:

Use sparse_categorical_crossentropy to solve this: categorical_crossentropy expects one-hot targets of shape (None, 10), while sparse_categorical_crossentropy accepts the integer class labels in y_train directly.
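That is, keep y_train as integer labels and change only the loss:

model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)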

14. How to run your Django project from the terminal
If your Django project was developed inside a virtualenv, be sure to activate that virtualenv before running the project from the terminal, as shown below:

sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ source ./activate
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ cd /home/sunxiangguo/PycharmProjects/personality_web
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/PycharmProjects/personality_web$ python manage.py runserver
Performing system checks...

System check identified no issues (0 silenced).
July 25, 2017 - 11:43:50
Django version 1.9.13, using settings 'personality_web.settings'
Starting development server at http://127.0.0.1:8000/
Quit the server with CONTROL-C.