
Python: Commonly Used NLP APIs

龚远
2023-12-01

A summary of commonly used APIs for natural language processing (NLP).

nltk: the Natural Language Toolkit
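Most of the nltk features below rely on corpora and models that must be downloaded once per machine. A minimal setup sketch (which packages you actually need depends on the sections you run):

import nltk

# One-time downloads used by the examples in this article
nltk.download('punkt')          # sentence and word tokenizer models
nltk.download('wordnet')        # data for the WordNet lemmatizer
nltk.download('stopwords')      # stop-word lists
nltk.download('names')          # male/female name corpus
nltk.download('movie_reviews')  # labeled movie review corpus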

1. Tokenization

API:

# Import the tokenization module
import nltk.tokenize as tk

text = "Don't hesitate. Ask questions!"
# Split the text into sentences
sent_list = tk.sent_tokenize(text)
# Split the text into words
word_list = tk.word_tokenize(text)

# Tokenizer object that also splits off punctuation
punctTokenizer = tk.WordPunctTokenizer()
word_list = punctTokenizer.tokenize(text)
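With the sample text above, the three calls give roughly the following (illustrative output; details can vary slightly across nltk versions):

print(tk.sent_tokenize(text))
# ["Don't hesitate.", 'Ask questions!']
print(tk.word_tokenize(text))
# ['Do', "n't", 'hesitate', '.', 'Ask', 'questions', '!']
print(tk.WordPunctTokenizer().tokenize(text))
# ['Don', "'", 't', 'hesitate', '.', 'Ask', 'questions', '!']

Note that word_tokenize keeps contractions like "n't" together, while WordPunctTokenizer splits at every punctuation boundary.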

2. Stemming

API:

import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

# Porter stemmer: relatively lenient
pt_stemmer = pt.PorterStemmer()
# Lancaster stemmer: relatively aggressive
lc_stemmer = lc.LancasterStemmer()
# Snowball stemmer: in between; it requires a language argument
sb_stemmer = sb.SnowballStemmer('english')
# Run the stemmer
r = pt_stemmer.stem('playing')  # 'play'
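A quick comparison of the three stemmers on a few words (illustrative output; exact results depend on the nltk version):

for word in ('table', 'probably', 'wolves', 'playing'):
    print(word,
          pt_stemmer.stem(word),
          lc_stemmer.stem(word),
          sb_stemmer.stem(word))
# table    tabl    tabl    tabl
# probably probabl prob    probabl
# wolves   wolv    wolv    wolv
# playing  play    play    play

Lancaster trims the hardest ('probably' -> 'prob'); Porter and Snowball usually agree.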

3. Lemmatization

API:

import nltk.stem as ns
# Lemmatizer object (WordNet-based)
lemmatizer = ns.WordNetLemmatizer()
# Lemmatize as a noun / as a verb
n = lemmatizer.lemmatize('wolves', pos='n')  # 'wolf'
v = lemmatizer.lemmatize('gave', pos='v')    # 'give'
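A stemmer and a lemmatizer differ in what they return: the stemmer chops suffixes mechanically, while the lemmatizer maps the word to a real dictionary form and falls back to the input when the part of speech does not match. A small illustration:

import nltk.stem as ns
import nltk.stem.porter as pt

lemmatizer = ns.WordNetLemmatizer()
stemmer = pt.PorterStemmer()
print(stemmer.stem('wolves'))                   # 'wolv' (not a real word)
print(lemmatizer.lemmatize('wolves', pos='n'))  # 'wolf'
# The default pos is 'n', so verbs must pass pos='v' to be reduced
print(lemmatizer.lemmatize('gave'))             # 'gave' (unchanged)
print(lemmatizer.lemmatize('gave', pos='v'))    # 'give'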

Bag-of-Words Model

Basic idea: how often each word appears largely determines what a sentence means. Take a sentence as a sample, each word in it as a feature, and each word's count in the sentence as the feature value; the resulting mathematical model is called the bag-of-words model.
API:

import sklearn.feature_extraction.text as ft

sentences = [
    'The sky is blue.',
    'The sun is bright.',
    'The sun in the sky is bright.']
# Build a bag-of-words model object
cv = ft.CountVectorizer()
# Train the model; without toarray() the result is a sparse matrix
bow = cv.fit_transform(sentences).toarray()
print(bow)
# Get the feature names (on scikit-learn < 1.0 use get_feature_names())
words = cv.get_feature_names_out()
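With the three sample sentences above, the vocabulary and the printed count matrix come out as follows (illustrative output):

print(words)
# ['blue' 'bright' 'in' 'is' 'sky' 'sun' 'the']
# and the bow matrix printed above is:
# [[1 0 0 1 1 0 1]
#  [0 1 0 1 0 1 1]
#  [0 1 1 1 1 1 2]]

Each row is one sentence; the last row counts 'the' twice. Note that CountVectorizer lowercases the text and, by default, drops single-letter tokens.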

TF-IDF (Term Frequency-Inverse Document Frequency)

Idea: the higher a word's TF-IDF value, the more it contributes to the document's meaning; learning models are built according to these contribution weights.
API:

import sklearn.feature_extraction.text as ft

# Get a bag-of-words model first (reusing the sample sentences above)
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
# Get a TF-IDF transformer and train it on the bag-of-words matrix
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow).toarray()

# The final result has one row per sentence and
# len(cv.get_feature_names_out()) columns
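As an aside, scikit-learn also ships ft.TfidfVectorizer, which combines CountVectorizer and TfidfTransformer into a single estimator; a minimal sketch using the same sample sentences:

# TfidfVectorizer = CountVectorizer + TfidfTransformer in one object
tv = ft.TfidfVectorizer()
tfidf = tv.fit_transform(sentences).toarray()
print(tfidf.shape)  # (3, 7): one row per sentence, one column per word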

Three Major NLP Application Scenarios and Examples

Text Classification

Main approach: build a bag-of-words model from the text, generate the TF-IDF matrix from it, then choose a suitable model to train.

Example: news topic classification

API:

import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb
import numpy as np

train = sd.load_files(
    '../ml_data/20news',  encoding='latin1',
    shuffle=True, random_state=7)
train_x = np.array(train.data)    # sample inputs (raw article text)
train_y = np.array(train.target)  # sample labels, e.g. [0, 1, 2, 0, 1, 1, ...]
categories = train.target_names

# Prepare the samples: build the bag-of-words and TF-IDF matrices
cv = ft.CountVectorizer()
bow = cv.fit_transform(train_x)
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow)

# Train a naive Bayes model based on the multinomial distribution
model = nb.MultinomialNB()
model.fit(tfidf, train_y)

# Predict:
test_data = [
    'The curveballs of right handed pitchers '
    'tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is really good on slippery roads']
test_bow = cv.transform(test_data)
test_tfidf = tt.transform(test_bow)
pred_y = model.predict(test_tfidf)

for sent, index in zip(test_data, pred_y):
    print(sent, '->', categories[index])
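The vectorize-transform-classify flow above can also be wrapped in a scikit-learn Pipeline, so the same transforms are automatically reapplied at predict time; a sketch assuming the train_x, train_y, and test_data variables from above:

import sklearn.pipeline as sp

# Chain vectorizer -> TF-IDF -> classifier into a single estimator
pipe = sp.Pipeline([
    ('cv', ft.CountVectorizer()),
    ('tt', ft.TfidfTransformer()),
    ('nb', nb.MultinomialNB())])
pipe.fit(train_x, train_y)        # raw text in, labels out
pred_y = pipe.predict(test_data)  # no manual transform calls needed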

nltk also provides classifiers in nltk.classify that can be trained for this kind of task.

Example: gender recognition from names

API:

import random
import nltk.corpus as nc
import nltk.classify as cf
male_names = nc.names.words('male.txt')
female_names = nc.names.words('female.txt')

data = []
for male_name in male_names:
    # Feature: the last two letters of the name
    feature = {'feature': male_name[-2:].lower()}
    data.append((feature, 'male'))
for female_name in female_names:
    feature = {'feature': female_name[-2:].lower()}
    data.append((feature, 'female'))
# Shuffle reproducibly, then split half for training, half for testing
random.seed(7)
random.shuffle(data)
train_data = data[:int(len(data) / 2)]
test_data = data[int(len(data) / 2):]
model = cf.NaiveBayesClassifier.train(train_data)
ac = cf.accuracy(model, test_data)
print('accuracy:', ac)
names = ['Leonardo', 'Amy', 'Sam', 'Tom', 'Katherine', 'Taylor', 'Susanne']
genders = []
for name in names:
    feature = {'feature': name[-2:].lower()}
    gender = model.classify(feature)
    genders.append(gender)
for name, gender in zip(names, genders):
    print(name, '->', gender)
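nltk's NaiveBayesClassifier can also report which feature values drive its decisions, which is a quick sanity check on the last-two-letters feature:

# Print the suffixes that most strongly separate the two classes
model.show_most_informative_features(5)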

Sentiment Analysis

Workflow: prepare the data, train the model, then run predictions.

Example: movie review sentiment

Train on labeled movie reviews to learn whether a review is positive or negative, then predict on new reviews.

import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu
pdata = []
# Paths of every file in the pos folder
fileids = nc.movie_reviews.fileids('pos')
# Collect the words of every positive review and store them in pdata
for fileid in fileids:
    sample = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        sample[word] = True
    pdata.append((sample, 'POSITIVE'))
# Collect the words of every negative review and store them in ndata
ndata = []
fileids = nc.movie_reviews.fileids('neg')
for fileid in fileids:
    sample = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        sample[word] = True
    ndata.append((sample, 'NEGATIVE'))
# Split the train/test counts (80% for training)
pnumb, nnumb = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnumb] + ndata[:nnumb]
test_data = pdata[pnumb:] + ndata[nnumb:]
# Train a naive Bayes classifier on the training data
model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
print(ac)
# Simulate predictions on new reviews
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this movie.',
    'The direction was terrible and the story was all over the place.']
for review in reviews:
    sample = {}
    words = review.split()
    for word in words:
        sample[word] = True
    pcls = model.classify(sample)
    print(review, '->', pcls)
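Instead of only the hard label, the classifier can return a probability distribution over labels via prob_classify; a short sketch continuing from the model above:

# Probability distribution over labels for the first review
sample = {word: True for word in reviews[0].split()}
pd = model.prob_classify(sample)
print(pd.max())             # most likely label, e.g. 'POSITIVE'
print(pd.prob('POSITIVE'))  # probability assigned to the POSITIVE class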

Topic Extraction

Main idea: after tokenization, word cleanup, and stemming, TF-IDF-based weighting can surface the core topic words of a passage, from which the text's topic can be judged. This is unsupervised learning.

Example: text topic extraction

import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc
doc = []
with open('../../ml_data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])  # strip the trailing newline
# Create the tokenizer object
tokenizer = tk.WordPunctTokenizer()
# Set up the stop words
stopwords = nc.stopwords.words('english')
signs = [',', '.', '!']
# Snowball stemmer
stemmer = sb.SnowballStemmer('english')

lines_tokens = []
for line in doc:
    # Tokenize: tokens is a list [word, word, word, ...]
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords and token not in signs:
            # Stem each word; stop words and punctuation are excluded
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)

# Store every word from lines_tokens in the dictionary object provided by gc; each word gets an integer id.
dic = gc.Dictionary(lines_tokens)

# Iterate over each line and build the bag-of-words list
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)

# Set the number of topics
n_topics = 2
# Build the LDA model from the bag-of-words, the topic count, the dictionary, and the number of training passes
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
# Print the 4 words that contribute most to each topic
topics = model.print_topics(num_topics=n_topics, num_words=4)
print(topics)
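A trained LdaModel can also infer the topic mixture of unseen text through the same dictionary; a minimal sketch reusing tokenizer, stopwords, signs, stemmer, dic, and model from above (the sample sentence is hypothetical):

# Preprocess a new line exactly like the training lines
new_line = 'Encryption and ciphers keep messages secure.'
tokens = [stemmer.stem(t)
          for t in tokenizer.tokenize(new_line.lower())
          if t not in stopwords and t not in signs]
# Map it into the trained dictionary and query the model
new_bow = dic.doc2bow(tokens)
print(model.get_document_topics(new_bow))
# e.g. [(0, 0.13), (1, 0.87)] -- (topic id, weight) pairs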
