中文维基百科地址:https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2;
搜狗全网新闻语料地址:http://www.sogou.com/labs/resource/ca.php;
中文维基百科是xml格式的压缩文件,有1G左右。下面仅以中文维基语料进行模型训练。
1.将xml文件转为text。执行命令:python process_wiki_data.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
# process_wiki_data.py
# -*- coding: utf-8 -*-
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus
_CONVERT_USAGE = "usage: python %s <wiki-dump.xml.bz2> <output.text>"


def convert_wiki_dump(argv):
    """Convert a Wikipedia XML dump into plain text, one article per line.

    Each article yielded by ``WikiCorpus.get_texts()`` is written as a single
    line of space-separated tokens.

    Args:
        argv: Command-line style argument list,
            ``[program, dump_path, output_path]``. Anything after the first
            two paths is ignored.

    Returns:
        0 after a successful conversion, 1 when fewer than two paths were
        supplied (a usage message is printed in that case).
    """
    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(argv))

    # The script has no module docstring, so the previous
    # ``globals()['__doc__'] % locals()`` raised TypeError (None % dict)
    # instead of showing any help. Print an explicit usage line.
    if len(argv) < 3:
        print(_CONVERT_USAGE % program)
        return 1
    inp, outp = argv[1:3]

    # dictionary={} hands WikiCorpus a ready (empty) dictionary so it does not
    # scan the whole dump to build one before the texts are streamed.
    # NOTE(review): ``lemmatize`` is a gensim 3.x keyword; gensim 4 removed it,
    # so drop that argument when running against gensim >= 4.
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})

    count = 0
    # ``with`` guarantees the output file is flushed and closed even if the
    # dump turns out to be corrupt half-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        for count, text in enumerate(wiki.get_texts(), start=1):
            output.write(" ".join(text) + "\n")
            if count % 10000 == 0:
                logger.info("Saved %d articles", count)
    logger.info("Finished Saved %d articles", count)
    return 0


if __name__ == '__main__':
    sys.exit(convert_wiki_dump(sys.argv))
``
![处理数据结果](https://img-blog.csdnimg.cn/20190806175953398.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3hpZXppXzEwMTU=,size_16,color_FFFFFF,t_70)
2.对变换格式后的语料进行分词处理(中文分词以jieba为例)命令:python word_seg.py
```python
# -*- coding: utf-8 -*-
# 逐行读取文件数据进行jieba分词
# word_seg.py
import jieba
import jieba.analyse
import jieba.posseg as pseg #引入词性标注接口
import codecs,sys
def segment_corpus(src_path, dst_path):
    """Segment a UTF-8 text file with jieba, one input line at a time.

    Every line of ``src_path`` is cut with ``jieba.cut`` in accurate mode
    (``cut_all=False``) and the tokens, joined by single spaces, are appended
    to ``dst_path``.

    Args:
        src_path: Path of the plain-text corpus to read.
        dst_path: Path of the segmented corpus to write (overwritten).

    Returns:
        The number of lines processed.
    """
    count = 0
    # Both files are managed by ``with`` so they are closed even when
    # segmentation fails part-way through a large corpus.
    with open(src_path, 'r', encoding='utf8') as src, \
            open(dst_path, 'w', encoding='utf8') as dst:
        print('open files.')
        for count, line in enumerate(src, start=1):
            print('---processing ', count, ' article---')
            # No newline is appended here: as in the original, the line
            # terminator is expected to come through jieba.cut as a token.
            # write() replaces writelines(), which iterated the joined string
            # character by character.
            dst.write(' '.join(jieba.cut(line, cut_all=False)))
    print('well done.')
    return count


if __name__ == '__main__':
    # Defaults follow the file names used by the surrounding steps: step 1
    # produces wiki.zh.text and step 3 consumes wiki.zh.text.seg. (The old
    # hard-coded input name, zhwiki.zh.text, is never produced by step 1.)
    # Optional positional arguments override either path.
    src_name = sys.argv[1] if len(sys.argv) > 1 else 'wiki.zh.text'
    dst_name = sys.argv[2] if len(sys.argv) > 2 else 'wiki.zh.text.seg'
    segment_corpus(src_name, dst_name)
```
3.使用分割后的语料进行模型的训练,命令:python word2vecmodel.py wiki.zh.text.seg wiki.zh.text.model wiki.zh.text.vector
# -*- coding: utf-8 -*-
# word2vecmodel.py用于训练模型
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
_TRAIN_USAGE = "usage: python %s <segmented_corpus> <model_out> <vector_out>"


def train_word2vec(argv):
    """Train a Word2Vec model on a whitespace-segmented corpus.

    Args:
        argv: Command-line style argument list,
            ``[program, corpus_path, model_path, vector_path]``. Anything
            after the first three paths is ignored.

    Returns:
        0 after the model and its text-format vectors have been saved,
        1 when fewer than three paths were supplied (a usage message is
        printed in that case).
    """
    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(argv))

    # The script has no module docstring, so the previous
    # ``globals()['__doc__'] % locals()`` raised TypeError (None % dict)
    # instead of showing any help. Print an explicit usage line.
    if len(argv) < 4:
        print(_TRAIN_USAGE % program)
        return 1
    inp, outp1, outp2 = argv[1:4]

    # NOTE(review): ``size`` is the gensim 3.x name of the vector-dimension
    # argument; gensim 4 renamed it to ``vector_size``.
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # Full model: can be reloaded with Word2Vec.load().
    model.save(outp1)
    # The word2vec text format is written through the KeyedVectors object;
    # the model-level save_word2vec_format() is deprecated and raises.
    model.wv.save_word2vec_format(outp2, binary=False)
    return 0


if __name__ == '__main__':
    sys.exit(train_word2vec(sys.argv))
4.测试模型效果,命令:python testmodel.py
# coding: utf-8
# testmodel.py
import gensim
# Words whose nearest neighbours are printed as a quick sanity check.
QUERY_WORDS = (
    u"足球",
    u"男人",
    u"女人",
    u"青蛙",
    u"姨夫",
    u"衣服",
    u"公安局",
    u"铁道部",
    u"清华大学",
    u"卫视",
    u"语言学",
    u"计算机",
)

model = gensim.models.Word2Vec.load("wiki.zh.text.model")
# Normalise the vectors in place; without replace=True the first similarity
# query allocates a second full-size matrix for the normalised copy.
model.init_sims(replace=True)

# Queries go through model.wv: the model-level most_similar / similarity /
# doesnt_match wrappers are deprecated in gensim 3.x.
for word in QUERY_WORDS:
    try:
        neighbours = model.wv.most_similar(word)
    except KeyError:
        # Training used min_count=5, so a rare word may be missing from the
        # vocabulary; report it and carry on with the remaining queries.
        print(word, "not in vocabulary")
        continue
    for neighbour, score in neighbours:
        print(neighbour, score)

# These results used to be computed and thrown away, which shows nothing
# when the file is run as a script; print them instead.
print(model.wv.similarity(u"计算机", u"自动化"))
print(model.wv.similarity(u"女人", u"男人"))
print(model.wv.doesnt_match(u"早餐 晚餐 午餐 中心".split()))
运行结果:
排球 0.6024000644683838
足球运动 0.599421501159668
冰球 0.5516613721847534
所遇问题:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python2.7/dist-packages/gensim/utils.py", line 1422, in new_func1
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/gensim/models/base_any2vec.py", line 1397, in most_similar
    return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer)
  File "/usr/local/lib/python2.7/dist-packages/gensim/models/keyedvectors.py", line 509, in most_similar
    self.init_sims()
  File "/usr/local/lib/python2.7/dist-packages/gensim/models/keyedvectors.py", line 1366, in init_sims
    self.vectors_norm = (self.vectors / sqrt((self.vectors ** 2).sum(-1))[..., newaxis]).astype(REAL)
MemoryError
解决方案:
在加载完Word2Vec模型后,接着输入这一行命令:model.init_sims(replace = True)。replace=True 会在原向量矩阵上就地归一化,不再额外分配一份同样大小的 vectors_norm 矩阵,因此可以避免上面的 MemoryError;代价是归一化之后的模型不能再继续训练。