





from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.datasets import fetch_20newsgroups

import matplotlib.pyplot as plt
from wordcloud import WordCloud

import re

import nltk
from nltk import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer 
from nltk.stem import PorterStemmer 
from nltk.corpus import wordnet

from nltk.corpus import movie_reviews

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
import numpy as np #'import numpy'
import pandas as pd
import random

# these are added for importing a corpus from a zipfile
from zipfile import ZipFile
import os.path
from os import path




# Effective stop-word list handed to the vectorizer. It starts as None and is
# reassigned from set_stop_words() further down, once the settings are chosen.
stop_words = None
# The two ready-made stop-word lists that can be assigned to stop_word_list in the settings.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded already - confirm.
nltk_stop_words = nltk.corpus.stopwords.words('english')



# nice preview of document
def get_preview(docs, targets, target_names, doc_id, max_len=0):
    """Build a printable preview of one document together with its label.

    Args:
        docs: Sequence of document texts.
        targets: Sequence of labels parallel to ``docs``. Each entry is either
            the label string itself or an index into ``target_names``.
        target_names: Label names, used to resolve non-string entries of
            ``targets``.
        doc_id: Position of the document in ``docs`` and ``targets``.
        max_len: When less than 1 the full text is returned under "Label" and
            "Full Text" headings. Otherwise a single tab-separated line of
            document id, label and an excerpt produced by
            ``get_excerpt(text, max_len)`` is returned.

    Returns:
        The preview as a string.
    """
    target = targets[doc_id]
    # labels may already be names (str) or still be indices into target_names
    label = target if isinstance(target, str) else target_names[target]

    if max_len < 1:
        # full view: headed label followed by the complete text
        return ('Label\n'
                '=====\n'
                + label
                + '\n\nFull Text\n'
                '=========\n'
                + docs[doc_id]
                + '\n')

    # compact view: one line per document
    excerpt = get_excerpt(docs[doc_id], max_len)
    return str(doc_id) + '\t' + label + '\t' + excerpt

# matches any run of whitespace so that it can be collapsed to a single space
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

# generate an excerpt
def get_excerpt(text, max_len):
    """Return the first ``max_len`` characters of ``text`` as a one-line excerpt.

    Whitespace runs inside the cut-off piece are collapsed to single spaces,
    '...' is appended when ``text`` is longer than ``max_len``, and the result
    is stripped of leading and trailing whitespace.
    """
    head = text[:max_len]
    collapsed = _RE_COMBINE_WHITESPACE.sub(' ', head)
    suffix = '...' if len(text) > max_len else ''
    return (collapsed + suffix).strip()

# combine a defined stop word list (or no stop word list) with any extra stop words defined
def set_stop_words(stop_word_list, extra_stop_words):
    """Return the stop words to use for vectorising.

    Args:
        stop_word_list: A base collection of stop words, or None for no base list.
        extra_stop_words: Additional stop words; may be empty.

    Returns:
        A new list holding the base words followed by the extra words when any
        extra words are given; otherwise ``stop_word_list`` unchanged (which
        may be None).
    """
    if not extra_stop_words:
        # nothing to add: hand back whatever base list (or None) was chosen
        return stop_word_list
    base = [] if stop_word_list is None else list(stop_word_list)
    return base + list(extra_stop_words)

# initiate stemming or lemmatising
def set_normaliser(normalise):
    """Create the stemmer or lemmatiser named by ``normalise``.

    Args:
        normalise: 'PorterStemmer', 'SnowballStemmer', 'WordNetLemmatizer',
            or None for no normalisation.

    Returns:
        A new instance of the named NLTK class (the Snowball stemmer is
        created for 'english'), or None when ``normalise`` is None or is not
        one of the recognised names.
    """
    if normalise == 'PorterStemmer':
        return PorterStemmer()
    if normalise == 'SnowballStemmer':
        return SnowballStemmer('english')
    if normalise == 'WordNetLemmatizer':
        return WordNetLemmatizer()
    # no (recognised) normaliser requested
    return None

# token patterns, compiled once instead of building a tokenizer object for every document
_SKLEARN_TOKEN_RE = re.compile(r"(?u)\b\w\w+\b") # this is copied straight from sklearn source
_NOPUNCT_TOKEN_RE = re.compile(r'\w+')


def _wordnet_pos(treebank_tag):
    """Map a tag from nltk.pos_tag to the POS constant passed to the lemmatiser (noun by default)."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN


# using a custom tokenisation process to allow different tokenisers and stemming/lemmatising ...
def tokenise(doc):
    """Split ``doc`` into tokens and optionally stem or lemmatise them.

    Behaviour is driven by three module-level settings:
      * ``tokeniser``: 'sklearn', 'word_tokenize', 'wordpunct' or 'nopunct';
        any other value falls back to NLTK's word_tokenize.
      * ``normalise``: None, 'PorterStemmer', 'SnowballStemmer' or
        'WordNetLemmatizer'.
      * ``normaliser``: the object created for ``normalise`` by set_normaliser().

    Args:
        doc: The document text.

    Returns:
        A list of token strings, normalised when ``normalise`` selects a
        stemmer or the lemmatiser.
    """
    global tokeniser, normalise, normaliser
    # you could obviously add more tokenisers here if you wanted ...
    if tokeniser == 'sklearn':
        # findall on the pattern is what a non-gap RegexpTokenizer does for this pattern
        tokens = _SKLEARN_TOKEN_RE.findall(doc)
    elif tokeniser == 'wordpunct':
        tokens = wordpunct_tokenize(doc)
    elif tokeniser == 'nopunct':
        tokens = _NOPUNCT_TOKEN_RE.findall(doc)
    else:
        # 'word_tokenize' and the default for anything unrecognised
        tokens = word_tokenize(doc)

    # if using a normaliser then return the normalised tokens ...
    if normalise in ('PorterStemmer', 'SnowballStemmer'):
        return [normaliser.stem(t) for t in tokens]
    if normalise == 'WordNetLemmatizer':
        # NLTK's lemmatiser needs parts of speech, otherwise assumes everything is a noun
        return [normaliser.lemmatize(word, _wordnet_pos(tag))
                for word, tag in nltk.pos_tag(tokens)]
    # no normaliser so just return tokens
    return tokens

# CountVectorizer pre-processor - remove numerics.
def preprocess_text(text):
    """Return ``text`` lower-cased with every run of digits deleted."""
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

## Load the corpus from a zip file. The corpus should keep the documents of each class in a
## separate directory, and the corpus zip file should sit in the same folder as this program.
## Run the code below to unzip it. (The data set used for this example is named 'UN-AUvNZ.zip';
## it is highlighted in red here.)

corpus_filename = 'UN-AUvNZ.zip'

# Create a ZipFile object for the corpus archive
with ZipFile(corpus_filename, 'r') as zipObj:
    # Extract all the contents of zip file in current directory
    zipObj.extractall()

corpus_dirname = 'UN-AUvNZ'

# report whether the expected corpus directory is now present
if path.isdir(corpus_dirname):
    print('Directory exists:', corpus_dirname)
else:
    print('Directory does not exist!: ', corpus_dirname)

## If the directory was created, 'Directory exists:' is shown.


# Load the documents found under corpus_dirname, decoding them as utf-8.
dataset = load_files(corpus_dirname, encoding='utf-8')
# Class names as reported by load_files; used for labels and reports from here on.
dataset_target_names = dataset.target_names

## Printing dataset_target_names gives ['AU', 'NZ']


# assign the train/test split - 0.2 is 80% for training, 20% for testing
test_size = 0.2

# keeping data structure compatible with previous notebook:
# labels are kept as class-name strings (looked up from the integer ids in dataset.target),
# which is what the confusion-matrix code below uses as dictionary keys
dataset_target = [dataset_target_names[target] for target in dataset.target]

# do the train test split ...
# docs_train and docs_test are the documents
# y_train and y_test are the labels
# NOTE(review): no random_state is passed, so the split differs on every run - confirm
# whether the original notebook fixed a seed here.
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                          dataset_target,
                                                          test_size = test_size)

# one-line preview (excerpt limited to 80 characters) of every training document
for train_id, _ in enumerate(docs_train):
    print(get_preview(docs_train, y_train, dataset_target_names, train_id, max_len=80))



# full preview of one chosen training document
train_id = 11
print(get_preview(docs_train, y_train, dataset_target_names, train_id))

## Next step: feature extraction / pre-processing. The following part uses Naive Bayes and the
## bag-of-words model. A combined explanation follows after this block of code.

Vectorizer = CountVectorizer # Choose between token counts or tf-idf weights, valid values are CountVectorizer or TfidfVectorizer without quotes
# NOTE(review): the pipeline below also passes preprocess_text as preprocessor, and that function
# lower-cases the text itself - so setting this to False may have no visible effect; confirm.
lowercase = True # valid values are False or True (to make it lowercase)
tokeniser = 'sklearn' # should be one of the following (with quotes): 'sklearn', 'word_tokenize', 'wordpunct', 'nopunct'
normalise = None # should be one of the following: None or 'PorterStemmer' or 'SnowballStemmer' or 'WordNetLemmatizer'
stop_word_list = None # should be either None or nltk_stop_words or sklearn_stop_words
extra_stop_words = [] # list of any extra stop words e.g. ['one' , 'two']
min_df = 0.0 # ignores terms that occur below a minimum proportion of documents.
max_df = 1.0 # ignores terms that occur above a maximum proportion of documents.
max_features = 1000 # set this to None for no limit or set to the maximum number of the most frequent features.
ngram_range = (1, 1) # (smallest, largest) n-gram size to extract; (1, 1) means single words only
encoding = 'utf-8'
decode_error = 'ignore' # what to do if contains characters not of the given encoding - options 'strict', 'ignore', 'replace'



# number of features kept by the SelectKBest step of the pipeline
kbest = 100


# Re-run this section whenever anything in the earlier part changes (it contains a random step).

stop_words = set_stop_words(stop_word_list, extra_stop_words)
normaliser = set_normaliser(normalise)

# vectorise -> keep the kbest features by mutual information -> classify
pipeline = Pipeline([
    ('vectorizer', Vectorizer(tokenizer = tokenise,
                              lowercase = lowercase,
                              min_df = min_df,
                              max_df = max_df,
                              max_features = max_features,
                              stop_words = stop_words,
                              ngram_range = ngram_range,
                              encoding = encoding,
                              preprocessor = preprocess_text,
                              decode_error = decode_error)),
    ('selector', SelectKBest(score_func = mutual_info_classif, k=kbest)),
    ('classifier', MultinomialNB()), #here is where you would specify an alternative classifier
])

# echo the configuration so that a run's output records how it was produced
print('Classifier settings')
print('classifier:', type(pipeline.named_steps['classifier']).__name__)
print('selector:', type(pipeline.named_steps['selector']).__name__)
print('vectorizer:', type(pipeline.named_steps['vectorizer']).__name__)
print('classes:', dataset_target_names)
print('lowercase:', lowercase)
print('tokeniser:', tokeniser)
print('normalise:', normalise)
print('min_df:', min_df)
print('max_df:', max_df)
print('max_features:', max_features)
# report the base stop-word list by name rather than dumping its contents
if stop_word_list == nltk_stop_words:
    print('stop_word_list:', 'nltk_stop_words')
elif stop_word_list == sklearn_stop_words:
    print('stop_word_list:', 'sklearn_stop_words')
else:
    print('stop_word_list:', 'None')
print('extra_stop_words:', extra_stop_words)
print('ngram_range:', ngram_range)
print('encoding:', encoding)
print('decode_error:', decode_error)
print('kbest:', kbest)


# Output of the settings printout above with the default settings:
#
# Classifier settings
# classifier: MultinomialNB
# selector: SelectKBest
# vectorizer: CountVectorizer
# classes: ['AU', 'NZ']
# lowercase: True
# tokeniser: sklearn
# normalise: None
# min_df: 0.0
# max_df: 1.0
# max_features: 1000
# stop_word_list: None
# extra_stop_words: []
# ngram_range: (1, 1)
# encoding: utf-8
# decode_error: ignore
# kbest: 100


# train on the training documents, then label the held-out test documents
y_predicted = pipeline.fit(docs_train, y_train).predict(docs_test)

# print report
print('Evaluation metrics')
print(metrics.classification_report(y_test,
                                    y_predicted,
                                    target_names = dataset_target_names))

# confusion matrix with rows/columns in the order of dataset_target_names, drawn as a plot
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted, labels=dataset_target_names)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=dataset_target_names,
).plot(include_values=True, cmap='Blues', ax=None, xticks_rotation='vertical')

vect = pipeline.steps[0][1]
clf = pipeline.steps[2][1]

# invert the vocabulary (term -> column index) into column index -> term
lookup = dict((v,k) for k,v in vect.vocabulary_.items())



# Get SelectKBest feature scores
features = pipeline.named_steps['selector']
# get top k feature indices
cols = features.get_support(indices=True)
# get corresponding feature scores
top_k_feature_scores  = [features.scores_[i] for i in cols]

# get vectorizer
featnames = pipeline.named_steps['vectorizer']
# get all feature names, ordered by column index; built from the fitted vocabulary so it
# does not depend on get_feature_names(), which newer scikit-learn releases no longer provide
fred = [lookup[i] for i in range(len(lookup))]
# get corresponding feature names
top_k_feature_names = [fred[i] for i in cols]

# NOTE(review): the definition of logodds was missing from this file. It is reconstructed here
# as log P(term | second class) - log P(term | first class), because the sorts below treat low
# values as indicative of dataset_target_names[0] and high values as indicative of
# dataset_target_names[1]. This assumes the classifier's class order matches
# dataset_target_names - confirm against the original notebook.
logodds = clf.feature_log_prob_[1] - clf.feature_log_prob_[0]

names_scores = list(zip(top_k_feature_names, top_k_feature_scores, logodds))
ns_df = pd.DataFrame(data = names_scores, columns=['Feature_names', 'Feature_Scores', 'Log odds'])

# Sort the dataframe for better visualization; each heading is followed by the first
# 20 rows of the matching ordering so that the heading is not printed on its own
print('Top features by information gain')
ns_df_sorted = ns_df.sort_values(['Feature_Scores', 'Log odds', 'Feature_names'], ascending = [False, False, False])
print(ns_df_sorted.head(20))

print('Features most indicative of', dataset_target_names[0])
ns_df_sorted = ns_df.sort_values(['Log odds', 'Feature_Scores', 'Feature_names'], ascending = [True, False, False])
print(ns_df_sorted.head(20))

print('Features most indicative of', dataset_target_names[1])
ns_df_sorted = ns_df.sort_values(['Log odds', 'Feature_Scores', 'Feature_names'], ascending = [False, False, False])
print(ns_df_sorted.head(20))


# size of the vectorizer's vocabulary
print('Total Features: ',len(fred))


# number of features kept by the selector
print('Total Features: ',len(ns_df['Feature_names']))


# setup a counter (term -> count) and a preview text for each cell in the confusion matrix,
# addressed as [true class name][predicted class name]
counter = {true_name: {pred_name: {} for pred_name in dataset_target_names}
           for true_name in dataset_target_names}
previews = {true_name: {pred_name: '' for pred_name in dataset_target_names}
            for true_name in dataset_target_names}

# get doc-term matrix for test docs
doc_terms = vect.transform(docs_test)

# selected feature names as a set, built once for fast membership tests
selected_terms = set(ns_df['Feature_names'])

# iterate through all predictions, building the counter and preview of docs
# there is a better way to do this, but this will do!

for doc_id, prediction in enumerate(pipeline.predict(docs_test)):
    cell_counts = counter[y_test[doc_id]][prediction]
    for k, v in enumerate(doc_terms[doc_id].toarray()[0]):
        if v > 0 and lookup[k] in selected_terms:
            cell_counts[lookup[k]] = cell_counts.get(lookup[k], 0) + v
    previews[y_test[doc_id]][prediction] += get_preview(docs_test, y_test, dataset_target_names, doc_id, max_len=80) + '\n'

# output a wordcloud and preview of docs for each cell of confusion matrix ...
for true_target, target_name in enumerate(dataset_target_names):
    for predicted_target, ptarget_name in enumerate(dataset_target_names):
        if true_target == predicted_target:
            print(dataset_target_names[true_target],'Correctly classified')
        else:
            print(dataset_target_names[true_target],'incorrectly classified as',dataset_target_names[predicted_target])

        # one-line previews of the documents that fell into this cell
        print(previews[target_name][ptarget_name])

        frequencies = counter[target_name][ptarget_name]
        if not frequencies:
            # nothing to draw for this cell: WordCloud raises on an empty frequency table
            continue

        wordcloud = WordCloud(background_color="white", width=800, height=600, color_func=lambda *args, **kwargs: "black").generate_from_frequencies(frequencies)
        plt.figure(figsize=(16, 8), dpi= 600)
        plt.imshow(wordcloud, interpolation="bilinear")
        # needed for the figure to appear when this runs as a plain script
        plt.show()


# index of the test document to inspect
test_id = 4


print(get_preview(docs_test, y_test, dataset_target_names, test_id))

# list each selected feature that occurs in this document together with its weight;
# the selected names are put in a set once instead of rebuilding a list per vocabulary entry
selected_features = set(ns_df['Feature_names'])
for k, v in enumerate(vect.transform([docs_test[test_id]]).toarray()[0]):
    if v > 0 and lookup[k] in selected_features:
        print(v, '\t', lookup[k])
