当前位置: 首页 > 工具软件 > Word2VEC_Java > 使用案例 >

word2vec——训练自己的word2vec模型

马博学
2023-12-01

数据集:datasets/souhu 下面的所有文件夹的所有txt文件(与代码中的 root_path = "datasets/souhu" 一致)

代码:

import os
import re
import sys
import jieba
import torch
from gensim.models import Word2Vec, word2vec
import numpy as np
import jieba.analyse
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

# Configure matplotlib so CJK labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels properly
plt.rcParams['axes.unicode_minus'] = False    # display the minus sign properly

# Fix random seeds for reproducibility.
np.random.seed(100)
if torch.cuda.is_available():  # guard: seeding CUDA on a CPU-only host is a warning/no-op
    torch.cuda.manual_seed(100)
sys.stdout.flush()

nFile = 200  # NOTE(review): unused in this script — presumably a leftover per-class file cap
root_path = "datasets/souhu"

# Walk every class directory under root_path and extract keywords
# from each .txt file with TextRank.
all_word_list = []
for class_name in os.listdir(root_path):
    class_path = os.path.join(root_path, class_name)
    for file_name in os.listdir(class_path):
        file_path = os.path.join(class_path, file_name)
        with open(file_path, "r", encoding='utf-8') as f:
            txt = f.read()
        # Remove all whitespace before keyword extraction
        # ('+' instead of '*' avoids zero-length matches at every position).
        txt = re.sub("[    \t\n]+", "", txt)
        # TextRank keywords restricted to nouns/verbs; topK=None keeps all of them.
        word_list = jieba.analyse.textrank(
            txt, topK=None, withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v'))
        all_word_list.extend(word_list)

# Persist the space-joined keyword corpus in the format Text8Corpus expects.
# (No explicit f.close() needed — the with-block closes the file.)
result = " ".join(all_word_list)
with open("result.txt", "w", encoding="utf-8") as f:
    f.write(result)

sentences = word2vec.Text8Corpus("result.txt")  # load the corpus
# Train the model: 250-dimensional vectors; min_count=1 keeps even
# words that appear only once.
model = word2vec.Word2Vec(sentences, vector_size=250, min_count=1)
model.save("my_model.model")

# Save the raw keyword list alongside the model.
with open("all_word_list.pkl", "wb") as f:
    pickle.dump(all_word_list, f)

# Reload both artifacts to confirm they round-trip from disk.
model = Word2Vec.load("my_model.model")
with open("all_word_list.pkl", 'rb') as f:
    all_word_list = pickle.load(f)
 

用新生成的word2vec计算与输入单词最相近的单词:

用新生成的word2vec计算输入单词中最不同的:

# coding=utf-8
from gensim.models import Word2Vec

# Reload the model trained by the previous script and probe it.
en_wiki_wor2vec_model = Word2Vec.load('my_model.model')

# For each probe word, print the word followed by its nearest
# neighbours in embedding space. Iterating the list directly (instead
# of range(3) + indexing) works for any number of probe words.
testwords = ['苹果', '数学', '电影']
for word in testwords:
    res = en_wiki_wor2vec_model.wv.most_similar(word)
    print(word)
    print(res)

# Find the word that fits least well with the rest of the group.
testwords = ['济南', '北京', '叶蓁蓁']
re1 = en_wiki_wor2vec_model.wv.doesnt_match(testwords)
print(re1)

备注:一般是用维基百科的语料库进行word2vec的训练

 类似资料: