当前位置: 首页 > 工具软件 > Whiley > 使用案例 >

Rocchio算法对测试集进行预测时出错:ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 7 while Y.shape[1] == 35788



from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors.nearest_centroid import NearestCentroid
from pprint import pprint
import sys

# Reproducer for the ValueError quoted below: train a NearestCentroid
# (Rocchio-style) classifier on TF-IDF features of four 20-newsgroups
# categories, then try to classify two new sentences.

# Full training split; not referenced again in this snippet.
newsgroups_train = fetch_20newsgroups(subset='train')
categories = ['alt.atheism','comp.graphics','soc.religion.christian','sci.med']
train_data = fetch_20newsgroups(subset = "train", categories = categories)
# Learn the vocabulary from the training documents and build the
# document-term count matrix.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data.data)
# Learn IDF weights from the training counts and convert them to TF-IDF.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

clf = NearestCentroid().fit(train_tfidf, train_data.target)
# NOTE(review): this is a set, so the iteration order of the two sentences
# is not the order written here.
docs_new = {'OpenGL onthe GPU is fast','God is love'}
# NOTE(review): cause of the reported error. fit_transform() re-learns the
# vocabulary from docs_new alone (7 distinct words), so the matrix has 7
# columns instead of the 35788 columns the classifier was trained on.
# Calling count_vect.transform(docs_new) would reuse the training vocabulary.
docs_new_counts = count_vect.fit_transform(docs_new)
# NOTE(review): same issue -- this re-fits the IDF weights on the new
# documents; tfidf_transformer.transform(...) would reuse the training IDF.
docs_new_tfidf = tfidf_transformer.fit_transform(docs_new_counts)

# Raises ValueError here because of the column-count mismatch (7 vs 35788).
predicted = clf.predict(docs_new_tfidf)

# Print each new document next to the name of its predicted category.
for doc,category in zip(docs_new,predicted):
    print('%r to %s' %(doc,train_data.target_names[category]))

报错:ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 7 while Y.shape[1] == 35788






from sklearn.neighbors import KNeighborsClassifier
# Build one distance-weighted 7-NN classifier per prefix length: the entry
# stored under key n is fitted on the first n columns of xtrain only.
knns = {}
for n_feats in range(1, xtrain.shape[-1] + 1):
    knn = KNeighborsClassifier(weights='distance', n_neighbors=7)
    knns[n_feats] = knn
    leading_columns = xtrain[:, :n_feats]
    knn.fit(leading_columns, ytrain)
The classify function should take two parameters, namely the test data and the dictionary of classifiers. This way you ensure that the classification is performed by a classifier that was trained using exactly the same features as the test data (discarding the others):

def classify(test_data, classifiers):
    """Classify test_data with the classifier matching its feature count.

    Args:
        test_data: array-like that can be converted to a float array; the
            size of its last axis is taken as the number of features n.
        classifiers: mapping from a feature count n to the classifier that
            was trained with the first n features (any object exposing a
            ``predict`` method).

    Returns:
        Whatever ``classifiers[n].predict`` returns for the converted data.

    Raises:
        KeyError: if ``classifiers`` is a dict with no entry for n.
    """
    X = np.asarray(test_data, dtype='float')
    # The width of the input selects which classifier is used.
    n_feats = X.shape[-1]
    return classifiers[n_feats].predict(X)
from time import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors.nearest_centroid import NearestCentroid

def classify(test_data, classifiers):
    """Classify test_data with the classifier matching its feature count.

    Args:
        test_data: matrix-like object with a ``shape`` attribute; the size
            of its last axis is taken as the number of features n.
        classifiers: mapping from a feature count n to the classifier that
            was trained with the first n features (any object exposing a
            ``predict`` method).

    Returns:
        Whatever ``classifiers[n].predict`` returns for ``test_data``.

    Raises:
        KeyError: if ``classifiers`` is a dict with no entry for n.
    """
    # The width of the input selects which classifier is used.
    n_feats = test_data.shape[-1]
    return classifiers[n_feats].predict(test_data)

# Train NearestCentroid classifiers on TF-IDF features of four 20-newsgroups
# categories, one classifier per feature-prefix length, then classify two
# new documents with the classifier whose width matches theirs.

# Full training split; not referenced again in this snippet.
newsgroups_train = fetch_20newsgroups(subset='train')
categories = ['comp.windows.x','sci.space','comp.sys.ibm.pc.hardware','sci.med']
train_data = fetch_20newsgroups(subset = "train", categories = categories)
# Learn the vocabulary / IDF weights from the training documents.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data.data)
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# knns[n] is fitted on the first n TF-IDF columns only, for n in 20..199.
# (Despite the name, these are NearestCentroid models, not k-NN.)
knns = {}
for n_feats in range(20, 200):
    knns[n_feats] = NearestCentroid()
    knns[n_feats].fit(train_tfidf[:, :n_feats], train_data.target)

# NOTE(review): this is a set, so iteration order is not the written order.
docs_new = {"""And International Space Station crew landed back on Earth in the grasslands of Kazakhstan after spending 186 days in space. Support workers helped the crew emerged safely from their capsule, which was charred by a fiery descent through the atmosphere""",
            """To jump-start adoption of Windows 10 the company is offering free upgrades to the vast majority of home users. Microsoft has set a target of 1 billion devices running windows 10 within three years of launch"""}
# NOTE(review): fit_transform() re-learns the vocabulary and IDF weights from
# docs_new, so column i here is presumably not the same term as column i of
# train_tfidf; classify() only matches on the number of columns. Consider
# count_vect.transform / tfidf_transformer.transform with one classifier
# trained on all features instead -- confirm before relying on the result.
docs_new_counts = count_vect.fit_transform(docs_new)
docs_new_tfidf = tfidf_transformer.fit_transform(docs_new_counts)

# classify() looks up knns[number of columns]; a column count outside
# 20..199 has no entry in knns.
predicted = classify(docs_new_tfidf, knns)
for doc,category in zip(docs_new,predicted):
    # Fix: the original line was missing the closing parenthesis of print().
    print('%r to %s' %(doc,train_data.target_names[category]))
['comp.sys.ibm.pc.hardware', 'comp.windows.x', 'sci.med', 'sci.space']
[1 3]
'To jump-start adoption of Windows 10 the company is offering free upgrades to the vast majority of home users. Microsoft has set a target of 1 billion devices running windows 10 within three years of launch' to comp.windows.x
'And International Space Station crew landed back on Earth in the grasslands of Kazakhstan after spending 186 days in space. Support workers helped the crew emerged safely from their capsule, which was charred by a fiery descent through the atmosphere' to sci.space
