文本分类python代码

时间:2022-07-13 22:18:15 阅读: 最新文章 文档下载
说明:文章内容仅供预览,部分内容可能不全。下载后的文档,内容与下面显示的完全一致。下载之前请确认下面内容是否您想要的,是否完整无缺。
#!/usr/bin/env python
# -*-coding:utf8-*-

import codecs
import os
from collections import OrderedDict

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

def tokenize(text):
    """Split *text* into word tokens with NLTK.

    Used as the ``tokenizer`` callback of the TF-IDF vectorizer.

    Args:
        text: The document text to tokenize.

    Returns:
        The token sequence produced by ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(text)
    # No stemming is applied: the original stem_tokens(tokens, stemmer) step
    # was disabled, so the raw tokens are returned as-is.
    return tokens

def read_corpus(topics):
    """Read every document under each topic directory and label it.

    For each entry of *topics*, the directory
    ``./data/topic_corpus_cut/<topic>/`` is walked recursively and every file
    found is read as UTF-8 text.

    Args:
        topics: Sequence of topic directory names. Byte strings are decoded
            as UTF-8; text strings are used unchanged.

    Returns:
        A ``(token_dict, y_train)`` tuple. ``token_dict`` is an ordered
        mapping from file path to file content; ``y_train`` is a list holding,
        in the same order as ``token_dict.values()``, the index in *topics*
        of the topic each document belongs to.
    """
    print("------------start-------------")
    # Ordered so that token_dict.values() stays aligned with y_train on every
    # Python version (plain dicts are unordered on Python 2).
    token_dict = OrderedDict()
    y_train = []
    # Iterate over the topics actually supplied instead of a fixed count.
    for label, topic in enumerate(topics):
        if isinstance(topic, bytes):
            topic = topic.decode("utf8")
        topic_root = u'./data/topic_corpus_cut/' + topic + u'/'
        for subdir, dirs, files in os.walk(topic_root):
            # Sort in place so the traversal order is reproducible.
            dirs.sort()
            for name in sorted(files):
                file_path = os.path.join(subdir, name)
                with codecs.open(file_path, "r", "utf-8") as handle:
                    # Key by full path: bare file names can repeat across
                    # topics and would silently overwrite each other.
                    token_dict[file_path] = handle.read()
                y_train.append(label)
    return token_dict, y_train

# def train

def train_model(token_dict, max_features=400):
    """Fit a TF-IDF vectorizer on the corpus and vectorize it.

    This can take some time on a large corpus.

    Args:
        token_dict: Mapping whose values are the document texts to vectorize.
        max_features: Passed to ``TfidfVectorizer`` as ``max_features``.
            Defaults to 400.

    Returns:
        A ``(tfs, tfidf)`` tuple: the result of ``fit_transform`` over the
        document texts, and the fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(
        tokenizer=tokenize,
        stop_words=None,
        max_features=max_features,
    )
    tfs = tfidf.fit_transform(token_dict.values())
    print(tfs.shape)
    return tfs, tfidf

if __name__ == '__main__':
    # Script entry point: load the corpus, fit TF-IDF features, train a linear
    # classifier with SGD, and print the predicted topic index for one sample.
    dROOT = u'./data/topics/'  # not referenced anywhere in the visible code

    topics = ['体育', '社会', '管理']

    token_dict, y_train = read_corpus(topics)
    X_train, tfidf = train_model(token_dict)

    # NOTE(review): newer scikit-learn releases replaced `n_iter` with
    # `max_iter` -- confirm against the installed version.
    parameters = {
        'loss': 'hinge',
        'penalty': 'l2',
        'n_iter': 50,
        'alpha': 0.00001,
        'fit_intercept': True,
    }
    # The original also carried a disabled alternative:
    # MultinomialNB with parameters = {'alpha': 0.01}.
    clf = SGDClassifier(**parameters).fit(X_train, y_train)

    # The sample text must be assigned before it is vectorized; it was only
    # present inside a comment, which made the next statement a NameError.
    X_test_str = u'政府采购好事方向应该支持运行急待改进完善提高专业性数额急需采购东西放权专业技术人员采购人员专业手续繁杂东西差价时间影响工作采购也许腐败'

    X_test = tfidf.transform([X_test_str])
    pred = clf.predict(X_test)
    print(pred)


本文来源:https://www.wddqw.com/doc/4a0870e7fc4ffe473268ab59.html