# NOTE: the link to the corresponding data is placed here.
import os

import pandas as pd
from LAC import LAC

from util.data_dir import root_dir
from util.logger import Log

# Name of this file, passed as the first argument to Log().info().
os_file_name = os.path.split(os.path.realpath(__file__))[-1]

# Load the LAC model once at import time.
lac = LAC(mode="lac")
# Load the user-defined dictionary into the model.
lac.load_customization(f'{root_dir}/data/lac_data/lac_define_dict.txt')

# Tags treated as "important"; only words carrying one of these tags are kept.
_IMPORTANT_TAGS = ('n', 'vn', 'an', 'nz', 'ORG', 'v', 'LOC')


def _read_word_list(path):
    """Return the whitespace-stripped lines of ``path``, or None if it is not a file."""
    if not os.path.isfile(path):
        return None
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(path, 'r', encoding='utf-8-sig') as f:
        return [line.strip() for line in f]


def load_stopwords() -> list:
    """
    Load the stop words from the base list and the supplementary list.

    A list file that does not exist is logged and skipped instead of raising,
    so the result may be empty.

    :return: list of stop words (base list first, then the supplementary list)
    """
    stopwords_path = f'{root_dir}/data/lac_data/stop_words.txt'
    stopwords_increase_path = f'{root_dir}/data/lac_data/stop_words_increase.txt'

    stopwords = _read_word_list(stopwords_path)
    if stopwords is None:
        Log().info(os_file_name, '《热点事件》|没有停用词表|')
        stopwords = []

    stopwords_increase = _read_word_list(stopwords_increase_path)
    if stopwords_increase is None:
        Log().info(os_file_name, '《热点事件》|没有停用词增加表|')
    else:
        stopwords.extend(stopwords_increase)

    return stopwords


def extract_important_word(content: str) -> str:
    """
    Segment ``content`` with LAC and keep only the important words.

    A word is kept when its tag is one of ``_IMPORTANT_TAGS``, it is at least
    two characters long, and it is not a stop word.

    :param content: str, the text to segment
    :return: str, the kept words joined by ',' in their original order
    """
    # lac.run returns a pair: index 0 holds the words, index 1 the tags.
    lac_result = lac.run(content)
    lac_result_df = pd.DataFrame(
        data=zip(lac_result[1], lac_result[0]),
        columns=['tag', 'word'],
    )

    # Keep only the rows whose tag is in the important set.
    seg_data = list(lac_result_df[lac_result_df['tag'].isin(_IMPORTANT_TAGS)].word)

    # Read the stop-word files once per call (not once per word) and use a
    # set so each membership test is O(1).
    stopwords = set(load_stopwords())

    return ','.join(
        word for word in seg_data
        if len(word) >= 2 and word not in stopwords
    )


if __name__ == "__main__":
    sentence = "小区物业没人管,垃圾也没人处理"
    sentence = extract_important_word(sentence)
    print(sentence)