#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import codecs

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier


def tokenize(text):
    # The corpus under topic_corpus_cut/ is already word-segmented,
    # so word_tokenize largely splits on the existing whitespace.
    tokens = nltk.word_tokenize(text)
    # stems = stem_tokens(tokens, stemmer)
    return tokens


def read_corpus(topics):
    print "------------start-------------"
    # Parallel lists keep each document aligned with its label; the
    # original dict keyed by bare filename could both reorder documents
    # (Python 2 dicts are unordered) and silently overwrite files that
    # share a name across topic directories.
    docs = []     # raw text of each document, in read order
    y_train = []  # topic index of each document, aligned with docs
    for i, topic in enumerate(topics):
        dROOT_SUB = u'./data/topic_corpus_cut/' + topic.decode("utf8") + '/'
        for subdir, dirs, files in os.walk(dROOT_SUB):
            for name in files:
                file_path = os.path.join(subdir, name)
                with codecs.open(file_path, "r", "utf-8") as f:
                    docs.append(f.read())
                y_train.append(i)
    return docs, y_train


def train_model(docs):
    # Fitting the vectorizer can take some time.
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=None, max_features=400)
    tfs = tfidf.fit_transform(docs)
    print tfs.shape
    return tfs, tfidf


if __name__ == '__main__':
    topics = ['体育', '社会', '管理']
    docs, y_train = read_corpus(topics)
    X_train, tfidf = train_model(docs)
    parameters = {
        'loss': 'hinge',
        'penalty': 'l2',
        'n_iter': 50,  # renamed to max_iter in newer scikit-learn releases
        'alpha': 0.00001,
        'fit_intercept': True,
    }
    # parameters = {'alpha': 0.01}
    # clf = MultinomialNB(**parameters).fit(X_train, y_train)
    clf = SGDClassifier(**parameters).fit(X_train, y_train)
    X_test_str = u'政府采购好事方向应该支持运行急待改进完善提高专业性数额急需采购东西放权专业技术人员采购人员专业手续繁杂东西差价时间影响工作采购也许腐败'
    X_test = tfidf.transform([X_test_str])
    pred = clf.predict(X_test)
    print pred
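

# -------------------------------------------------------------------
# A minimal evaluation sketch, assuming the docs/y_train lists returned
# by read_corpus() above: the script only prints one prediction for a
# hand-written sentence and gives no accuracy figure. The sketch holds
# out 20% of the corpus, fits the vectorizer on the training texts
# only, and prints test-set accuracy. train_test_split lives in
# sklearn.model_selection on current releases (sklearn.cross_validation
# on the older releases this script appears to target); the
# SGDClassifier iteration count is left at its default so the sketch
# runs on either.

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


def evaluate(docs, labels):
    # Split the raw texts first so the vectorizer never sees the test set.
    docs_tr, docs_te, y_tr, y_te = train_test_split(
        docs, labels, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=None,
                            max_features=400)
    X_tr = tfidf.fit_transform(docs_tr)
    X_te = tfidf.transform(docs_te)  # reuse the vocabulary fitted on train
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.00001,
                        fit_intercept=True).fit(X_tr, y_tr)
    print accuracy_score(y_te, clf.predict(X_te))

# Usage: evaluate(docs, y_train)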