#1) 导入常用的package,改变工作路径
#2) 合并25条每日新闻,分割训练集合与测试集合
#3) 提取特征features(简单版) 【优化:对文本经过了NLTK处理】
#4) 训练模型-SVM,基于高斯核函数
#5) 预测 & 评价标准 ROC-AUC
知识储备如下:
详细代码如下:
#1) 导入常用的package,改变工作路径
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from datetime import date
path = 'F:\Downloads\lecture02'
os.chdir(path)
data = pd.read_csv('Combined_News_DJIA.csv')
print(data.head())
#2) 合并25条每日新闻,分割训练集合与测试集合
#pandas.DataFrame.filter: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.filter.html
data["combined_news"] = data.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1)
#通过时间挑选训练集,测试集
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
#不小心发现,这个结果还不如之前的简单版;
#造成如此的原因有几种:•数据点太少,•One-Off result;
#我们到现在都只是跑了一次而已。如果我们像前面的例子一样,用Cross Validation来玩这组数据,说不定我们会发现,分数高的clf其实是overfitted了的。
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False)
#3) 提取特征features(简单版)
#这里直接调用sklearn中的TF-IDF的包来做feature,这里没有preprocessing步骤
feature_extraction = TfidfVectorizer()
#把训练集,先fit里面的文本信息,然后在来transform转换成TIIDF的格式
X_train = feature_extraction.fit_transform(train["combined_news"].values)
#这里因为feature_extraction记住了训练的格式,转换测试集的格式
X_test = feature_extraction.transform(test["combined_news"].values)
y_train = train["Label"].values
y_test = test["Label"].values
#4) 训练模型-SVM,基于高斯核函数:sklearn.svm.SVC — scikit-learn 0.19.1 documentation
#参考:https://blog.csdn.net/gamer_gyt/article/details/51265347
clf = SVC(probability=True, kernel = 'rbf')
#把X_train, y_train丢进去, 找出模型最佳参数
clf_train = clf.fit(X_train, y_train)
print(clf_train)
输出模型最佳参数:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False)
#5) 预测 & 评价标准
#参考:ROC和AUC介绍以及如何计算AUC:http://alexkong.net/2013/06/introduction-to-auc-and-roc/
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields' + str(roc_auc_score(y_test, predictions[:,1])))
输出第一次结果:
ROC-AUC yields0.574260752688
#【微小进阶版 - NLTK】
# 文本预处理中,我们这样直接把文本放进TF-IDF,虽然简单方便,但是还是不够严谨的。 我们可以把原文本做进一步的处理。
# 把句子中分词,全部小写,去除停用词,删除一些数字,使用nltk中的lemma取出词根
# 因为外部库,比如sklearn 只支持string输入,所以我们把调整后的list再变回string: apply(lambda x : ' '.join(x))
# 后面流程一样:fit数据 -> SVM方式跑一遍 -> 看下ROC-AUC值
X_train = train["combined_news"].str.lower().str.replace('"', '').str.replace(".", '').str.split()
X_test = test["combined_news"].str.lower().str.replace('"', '').str.replace(".", '').str.split()
print(X_test[1611]) #分词后的25个句子
输出分词后输入文本:
['[', "'most", 'cases', 'of', 'cancer', 'are', 'the', 'result', 'of', 'sheer', 'bad', 'luck', 'rather', 'than', 'unhealthy', 'lifestyles,', 'diet', 'or', 'even', 'inherited', 'genes,', 'new', 'research', 'suggests', 'random', 'mutations', 'that', 'occur', 'in', 'dna', 'when', 'cells', 'divide', 'are', 'responsible', 'for', 'two', 'thirds', 'of', 'adult', 'cancers', 'across', 'a', 'wide', 'range', 'of', "tissues'", "'iran", 'dismissed', 'united', 'states', 'efforts', 'to', 'fight', 'islamic', 'state', 'as', 'a', 'ploy', 'to', 'advance', 'us', 'policies', 'in', 'the', 'region:', 'the', 'reality', 'is', 'that', 'the', 'united', 'states', 'is', 'not', 'acting', 'to', 'eliminate', 'daesh', 'they', 'are', 'not', 'even', 'interested', 'in', 'weakening', 'daesh,', 'they', 'are', 'only', 'interested', 'in', 'managing', "it'", "'poll:", 'one', 'in', '8', 'germans', 'would', 'join', 'anti-muslim', "marches'", 'uk', 'royal', "family's", 'prince', 'andrew', 'named', 'in', 'us', 'lawsuit', 'over', 'underage', 'sex', 'allegations', "'some", '40', 'asylum-seekers', 'refused', 'to', 'leave', 'the', 'bus', 'when', 'they', 'arrived', 'at', 'their', 'destination', 'in', 'rural', 'northern', 'sweden,', 'demanding', 'that', 'they', 'be', 'taken', 'back', 'to', 'malm', 'or', 'some', 'big', "city'", 'pakistani', 'boat', 'blows', 'self', 'up', 'after', 'india', 'navy', 'chase', 'all', 'four', 'people', 'on', 'board', 'the', 'vessel', 'from', 'near', 'the', 'pakistani', 'port', 'city', 'of', 'karachi', 'are', 'believed', 'to', 'have', 'been', 'killed', 'in', 'the', 'dramatic', 'episode', 'in', 'the', 'arabian', 'sea', 'on', 'new', "year's", 'eve,', 'according', 'to', "india's", 'defence', 'ministry', "'sweden", 'hit', 'by', 'third', 'mosque', 'arson', 'attack', 'in', 'a', "week'", "'940", 'cars', 'set', 'alight', 'during', 'french', 'new', "year'", "'salaries", 'for', 'top', 'ceos', 'rose', 'twice', 'as', 'fast', 'as', 'average', 'canadian', 'since', 'recession:', "study'", "'norway", 'violated', 'equal-pay', 'law,', 'judge', 'says:', 'judge', 'finds', 'consulate', 'employee', 'was', 'unjustly', 'paid', '$30,000', 'less', 'than', 'her', 'male', "counterpart'", "'imam", 'wants', 'radical', 'recruiters', 'of', 'muslim', 'youth', 'in', 'canada', 'identified', 'and', 'dealt', "with'", "'saudi", 'arabia', 'beheaded', '83', 'people', 'in', '2014,', 'the', 'most', 'in', "years'", "'a", 'living', "hell'", 'for', 'slaves', 'on', 'remote', 'south', 'korean', 'islands', '-', 'slavery', 'thrives', 'on', 'this', 'chain', 'of', 'rural', 'islands', 'off', 'south', "korea's", 'rugged', 'southwest', 'coast,', 'nurtured', 'by', 'a', 'long', 'history', 'of', 'exploitation', 'and', 'the', 'demands', 'of', 'trying', 'to', 'squeeze', 'a', 'living', 'from', 'the', 'sea', "'worlds", '400', 'richest', 'get', 'richer,', 'adding', '$92bn', 'in', "2014'", "'rental", 'car', 'stereos', 'infringe', 'copyright,', 'music', 'rights', 'group', "says'", "'ukrainian", 'minister', 'threatens', 'tv', 'channel', 'with', 'closure', 'for', 'airing', 'russian', "entertainers'", "'palestinian", 'president', 'mahmoud', 'abbas', 'has', 'entered', 'into', 'his', 'most', 'serious', 'confrontation', 'yet', 'with', 'israel', 'by', 'signing', 'onto', 'the', 'international', 'criminal', 'court', 'his', 'decision', 'on', 'wednesday', 'gives', 'the', 'court', 'jurisdiction', 'over', 'crimes', 'committed', 'in', 'palestinian', "lands'", 'israeli', 'security', 'center', 'publishes', 'names', 'of', '50', 'killed', 'terrorists', "'concealed", 'by', "hamas'", 'the', 'year', '2014', 'was', 'the', 'deadliest', 'year', 'yet', 'in', "syria's", 'four-year', 'conflict,', 'with', 'over', '76,000', 'killed', "'a", 'secret', 'underground', 'complex', 'built', 'by', 'the', 'nazis', 'that', 'may', 'have', 'been', 'used', 'for', 'the', 'development', 'of', 'wmds,', 'including', 'a', 'nuclear', 'bomb,', 'has', 'been', 'uncovered', 'in', "austria'", "'restrictions", 'on', 'web', 'freedom', 'a', 'major', 'global', 'issue', 'in', "2015'", "'austrian", 'journalist', 'erich', 'mchel', 'delivered', 'a', 'presentation', 'in', 'hamburg', 'at', 'the', 'annual', 'meeting', 'of', 'the', 'chaos', 'computer', 'club', 'on', 'monday', 'december', '29,', 'detailing', 'the', 'various', 'locations', 'where', 'the', 'us', 'nsa', 'has', 'been', 'actively', 'collecting', 'and', 'processing', 'electronic', 'intelligence', 'in', "vienna'", "'thousands", 'of', 'ukraine', 'nationalists', 'march', 'in', "kiev'", "'chinas", 'new', 'years', 'resolution:', 'no', 'more', 'harvesting', 'executed', 'prisoners', "organs'", 'authorities', 'pull', 'plug', 'on', "russia's", 'last', 'politically', 'independent', 'tv', 'station]']
#删除停止词语
from nltk.corpus import stopwords
stop = stopwords.words('english')
#删除数字
import re
def hasNumbers(inputString):
return bool(re.search(r'\d',inputString ))
#取出词根
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
#定义一个函数把需要的词语保留下来
def check(word):
if word in stop:
return False
elif hasNumbers(word):
return False
else:
return True
#把这些规则运用在X_train, X_test中, 留下来的单词做lemma处理
X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
print(X_test[1611]) #处理过一波的25个词语,格式为list
输出去除停用词、数字、提取词干后的文本:
['[', "'most", 'case', 'cancer', 'result', 'sheer', 'bad', 'luck', 'rather', 'unhealthy', 'lifestyles,', 'diet', 'even', 'inherited', 'genes,', 'new', 'research', 'suggests', 'random', 'mutation', 'occur', 'dna', 'cell', 'divide', 'responsible', 'two', 'third', 'adult', 'cancer', 'across', 'wide', 'range', "tissues'", "'iran", 'dismissed', 'united', 'state', 'effort', 'fight', 'islamic', 'state', 'ploy', 'advance', 'u', 'policy', 'region:', 'reality', 'united', 'state', 'acting', 'eliminate', 'daesh', 'even', 'interested', 'weakening', 'daesh,', 'interested', 'managing', "it'", "'poll:", 'one', 'german', 'would', 'join', 'anti-muslim', "marches'", 'uk', 'royal', "family's", 'prince', 'andrew', 'named', 'u', 'lawsuit', 'underage', 'sex', 'allegation', "'some", 'asylum-seekers', 'refused', 'leave', 'bus', 'arrived', 'destination', 'rural', 'northern', 'sweden,', 'demanding', 'taken', 'back', 'malm', 'big', "city'", 'pakistani', 'boat', 'blow', 'self', 'india', 'navy', 'chase', 'four', 'people', 'board', 'vessel', 'near', 'pakistani', 'port', 'city', 'karachi', 'believed', 'killed', 'dramatic', 'episode', 'arabian', 'sea', 'new', "year's", 'eve,', 'according', "india's", 'defence', 'ministry', "'sweden", 'hit', 'third', 'mosque', 'arson', 'attack', "week'", 'car', 'set', 'alight', 'french', 'new', "year'", "'salaries", 'top', 'ceo', 'rose', 'twice', 'fast', 'average', 'canadian', 'since', 'recession:', "study'", "'norway", 'violated', 'equal-pay', 'law,', 'judge', 'says:', 'judge', 'find', 'consulate', 'employee', 'unjustly', 'paid', 'le', 'male', "counterpart'", "'imam", 'want', 'radical', 'recruiter', 'muslim', 'youth', 'canada', 'identified', 'dealt', "with'", "'saudi", 'arabia', 'beheaded', 'people', "years'", "'a", 'living', "hell'", 'slave', 'remote', 'south', 'korean', 'island', '-', 'slavery', 'thrives', 'chain', 'rural', 'island', 'south', "korea's", 'rugged', 'southwest', 'coast,', 'nurtured', 'long', 'history', 'exploitation', 'demand', 'trying', 'squeeze', 'living', 'sea', "'worlds", 'richest', 'get', 'richer,', 'adding', "'rental", 'car', 'stereo', 'infringe', 'copyright,', 'music', 'right', 'group', "says'", "'ukrainian", 'minister', 'threatens', 'tv', 'channel', 'closure', 'airing', 'russian', "entertainers'", "'palestinian", 'president', 'mahmoud', 'abbas', 'entered', 'serious', 'confrontation', 'yet', 'israel', 'signing', 'onto', 'international', 'criminal', 'court', 'decision', 'wednesday', 'give', 'court', 'jurisdiction', 'crime', 'committed', 'palestinian', "lands'", 'israeli', 'security', 'center', 'publishes', 'name', 'killed', 'terrorist', "'concealed", "hamas'", 'year', 'deadliest', 'year', 'yet', "syria's", 'four-year', 'conflict,', 'killed', "'a", 'secret', 'underground', 'complex', 'built', 'nazi', 'may', 'used', 'development', 'wmds,', 'including', 'nuclear', 'bomb,', 'uncovered', "austria'", "'restrictions", 'web', 'freedom', 'major', 'global', 'issue', "'austrian", 'journalist', 'erich', 'mchel', 'delivered', 'presentation', 'hamburg', 'annual', 'meeting', 'chaos', 'computer', 'club', 'monday', 'december', 'detailing', 'various', 'location', 'u', 'nsa', 'actively', 'collecting', 'processing', 'electronic', 'intelligence', "vienna'", "'thousands", 'ukraine', 'nationalist', 'march', "kiev'", "'chinas", 'new', 'year', 'resolution:', 'harvesting', 'executed', 'prisoner', "organs'", 'authority', 'pull', 'plug', "russia's", 'last', 'politically', 'independent', 'tv', 'station]']
#接下来导入sklearn中训练,但是格式不对,需要把X_train, X_test 转换为string格式,目前是list格式
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
print(X_test[1611]) #处理过一波的25个词语,格式为string
输出,把list转换为string后结果:
[ 'most case cancer result sheer bad luck rather unhealthy lifestyles, diet even inherited genes, new research suggests random mutation occur dna cell divide responsible two third adult cancer across wide range tissues' 'iran dismissed united state effort fight islamic state ploy advance u policy region: reality united state acting eliminate daesh even interested weakening daesh, interested managing it' 'poll: one german would join anti-muslim marches' uk royal family's prince andrew named u lawsuit underage sex allegation 'some asylum-seekers refused leave bus arrived destination rural northern sweden, demanding taken back malm big city' pakistani boat blow self india navy chase four people board vessel near pakistani port city karachi believed killed dramatic episode arabian sea new year's eve, according india's defence ministry 'sweden hit third mosque arson attack week' car set alight french new year' 'salaries top ceo rose twice fast average canadian since recession: study' 'norway violated equal-pay law, judge says: judge find consulate employee unjustly paid le male counterpart' 'imam want radical recruiter muslim youth canada identified dealt with' 'saudi arabia beheaded people years' 'a living hell' slave remote south korean island - slavery thrives chain rural island south korea's rugged southwest coast, nurtured long history exploitation demand trying squeeze living sea 'worlds richest get richer, adding 'rental car stereo infringe copyright, music right group says' 'ukrainian minister threatens tv channel closure airing russian entertainers' 'palestinian president mahmoud abbas entered serious confrontation yet israel signing onto international criminal court decision wednesday give court jurisdiction crime committed palestinian lands' israeli security center publishes name killed terrorist 'concealed hamas' year deadliest year yet syria's four-year conflict, killed 'a secret underground complex built nazi may used development wmds, including nuclear bomb, uncovered austria' 'restrictions web freedom major global issue 'austrian journalist erich mchel delivered presentation hamburg annual meeting chaos computer club monday december detailing various location u nsa actively collecting processing electronic intelligence vienna' 'thousands ukraine nationalist march kiev' 'chinas new year resolution: harvesting executed prisoner organs' authority pull plug russia's last politically independent tv station]
#准备跑一遍模型
feature_extraction = TfidfVectorizer(lowercase = False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)
clf = SVC(probability=True, kernel = 'rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields' + str(roc_auc_score(y_test, predictions[:, 1])))
输出优化的模型结果:
ROC-AUC yields0.548611111111