统计 SQuAD 的词汇得到 word2id,并把所有词都转成 id 的 Python 代码

来源:互联网 发布:linux下启动tomcat服务 编辑:程序博客网 时间:2024/06/16 20:04
# Build a word -> id vocabulary from a SQuAD-format JSON file and re-encode
# every context, question and (first) answer as a list of word ids.
#
# Tokenisation is plain whitespace splitting (str.split()), so punctuation
# stays attached to the neighbouring word.

import collections
import json


def load_squad(path):
    """Parse the SQuAD-format JSON file at *path* and return the result.

    The file is read as UTF-8 and is closed before returning, so no handle
    is left open.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


def collect_words(data):
    """Return every whitespace-separated token found in *data*.

    Tokens are gathered, in order, from each article title, each paragraph
    context, each question and the text of *every* answer.  Any answer list
    holding more than one entry is printed.
    """
    all_words = []
    for article in data["data"]:
        all_words.extend(article["title"].split())
        for paragraph in article["paragraphs"]:
            all_words.extend(paragraph["context"].split())
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                all_words.extend(qa["question"].split())
                if len(answers) > 1:
                    # Diagnostic output: shows questions with several answers.
                    print(answers)
                for answer in answers:
                    all_words.extend(answer["text"].split())
    return all_words


def build_word_to_id(all_words):
    """Map each distinct word in *all_words* to an integer id.

    Ids are assigned by descending frequency; ties are broken by the
    words' ordinary string ordering.  An empty input yields an empty dict.
    """
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
    return {word: index for index, (word, _) in enumerate(count_pairs)}


def _encode(text, word_to_id):
    """Return the ids of the whitespace-separated tokens of *text*.

    Raises KeyError if a token is missing from *word_to_id*.
    """
    return [word_to_id[word] for word in text.split()]


def vectorize(data, word_to_id):
    """Encode *data* as nested lists of word ids.

    The result has one entry per article; each article is a list of
    paragraphs; each paragraph is ``[context_ids, questions_answers]`` and
    each element of ``questions_answers`` is ``[question_ids, answer_ids]``.
    Only the first answer of each question is encoded; a question with no
    answers gets an empty ``answer_ids`` list.
    """
    data_vec = []
    for article in data["data"]:
        article_vec = []
        for paragraph in article["paragraphs"]:
            context_vec = _encode(paragraph["context"], word_to_id)
            questions_answers = []
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                # Guard against an empty answer list instead of indexing [0]
                # unconditionally.
                answer_vec = _encode(answers[0]["text"], word_to_id) if answers else []
                question_vec = _encode(qa["question"], word_to_id)
                questions_answers.append([question_vec, answer_vec])
            article_vec.append([context_vec, questions_answers])
        data_vec.append(article_vec)
    return data_vec


if __name__ == "__main__":
    # These names are bound at module level on purpose, so running the
    # script interactively still leaves the results available for inspection.
    data = load_squad("train-v1.1.json")
    all_words = collect_words(data)
    word_to_id = build_word_to_id(all_words)
    data_vec = vectorize(data, word_to_id)
    print("!")
原创粉丝点击