小程序开发基础: 从零开始打造自己的小程序
979
2022-11-01
TF-IDF算法抽取中文内容的主题关键词
db.ini
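# db
[db]
db_port = 3306
db_user = user
db_host = localhost
db_pass = pwd
# NOTE: the value of db_database was missing in the original listing; replace the placeholder with your database name.
db_database = your_database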
main.py
# -*- coding: utf-8 -*-
"""Print the top TF-IDF keywords for Chinese text rows read from MySQL.

The script reads connection settings from ``db.ini`` (next to this file),
segments each row's ``content`` with jieba, removes stop words listed in
``chinese_stopwords.txt`` and prints the highest-weighted terms per row.

Written for Python 2; the code below is kept runnable on Python 3 as well.
"""
import io
import os
import sys

import configparser
import MySQLdb
import MySQLdb.cursors  # explicit: DictCursor is referenced in init_db
import jieba.posseg as pseg
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str<->unicode conversions use UTF-8.
    # ``reload`` is not a builtin and ``setdefaultencoding`` does not exist
    # on Python 3, where text is already unicode.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')


def PATH(p):
    """Return the absolute path of *p* resolved relative to this file."""
    return os.path.abspath(os.path.join(os.path.dirname(__file__), p))


db_file = PATH('db.ini')


class IfTdf(object):
    """Load rows from MySQL and print their top TF-IDF terms."""

    def __init__(self):
        # Set the attributes first so __del__ is safe even if init_db raises.
        self.conn = None
        self.cur = None
        self.init_db()

    def init_db(self):
        """Open the MySQL connection described by the ``[db]`` section of db.ini.

        Sets ``self.conn`` and ``self.cur`` (a ``DictCursor``, so rows come
        back as dicts keyed by column name).
        """
        dbc = configparser.ConfigParser()
        dbc.read(db_file)
        self.conn = MySQLdb.connect(
            host=dbc.get("db", 'db_host'),
            user=dbc.get("db", 'db_user'),
            passwd=dbc.get("db", 'db_pass'),
            db=dbc.get("db", 'db_database'),
            port=int(dbc.get("db", 'db_port')),
            charset='utf8')
        self.cur = self.conn.cursor(MySQLdb.cursors.DictCursor)

    def get_data(self):
        """Return up to 1000 rows (``id``, ``content``) with the highest ids."""
        self.cur.execute(
            "SELECT id, content FROM `table` WHERE 1 ORDER BY `id` DESC LIMIT 1000")
        return self.cur.fetchall()

    def get_words(self, data):
        """Yield one space-joined string of non-stop-word tokens per row.

        :param data: iterable of row dicts with a ``content`` key.
        """
        # Read the stop words once, close the file deterministically, and keep
        # them in a set so the per-token membership test is O(1).
        with io.open(PATH('chinese_stopwords.txt'), encoding='utf-8') as fh:
            stop_words = {line.rstrip() for line in fh}

        for r in data:
            # A NULL column arrives as None; treat it as an empty document.
            raw = r['content'] or ''
            content = (raw.strip()
                       .replace('\n', '')
                       .replace(' ', '')
                       .replace('\t', '')
                       .replace('\r', ''))
            kept = [seg.word for seg in pseg.cut(content)
                    if seg.word not in stop_words]
            yield ' '.join(kept)

    def get_ids(self, data):
        """Yield a printable heading (id, content, ``Topic:``) for each row."""
        for r in data:
            yield '%s %s Topic:\n' % (r['id'], r['content'])

    def __del__(self):
        # Either handle may still be None if init_db failed part-way.
        cur = getattr(self, 'cur', None)
        if cur is not None:
            cur.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
        print('Finished!')

    def main(self):
        """Compute TF-IDF over the fetched rows and print the top terms of each."""
        data = self.get_data()
        if not data:
            # Nothing to vectorize; fit_transform would raise on an empty corpus.
            return

        list_words = list(self.get_words(data))
        list_ids = list(self.get_ids(data))

        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(list_words))

        # get_feature_names() was removed from newer scikit-learn releases in
        # favour of get_feature_names_out(); fall back for older versions.
        get_names = getattr(vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = vectorizer.get_feature_names
        words = get_names()
        weight = tfidf.toarray()

        n = 3  # number of keywords printed per row
        for heading, w in zip(list_ids, weight):
            print(u'{}:'.format(heading))
            # Indices of the terms sorted by descending weight; slicing (rather
            # than indexing 0..n-1) stays in range when the vocabulary has
            # fewer than n terms.
            loc = np.argsort(-w)
            for rank, idx in enumerate(loc[:n], 1):
                print(u'-{}: {} {}'.format(rank, words[idx], w[idx]))
            print('\n')


if __name__ == '__main__':
    # The listing defined the class but never ran it; run when used as a script.
    IfTdf().main()
参考:
https://github.com/dongxiexidian/Chinese/tree/master/dict
http://ruanyifeng.com/blog/2013/03/tf-idf.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://zhuanlan.zhihu.com/p/27330205
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。
发表评论
暂时没有评论,来抢沙发吧~