python jieba分词
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import jieba
import jieba.analyse
import pymysql
# Tag library file: the first space-separated field of every line is a tag.
TAG_FILE = r'D:\spider\shoes.txt'
# Dictionary file handed to jieba.
# NOTE(review): presumably the same file as TAG_FILE when the script is run
# from D:\spider -- confirm.
JIEBA_DICT = './shoes.txt'
# Product ids 1..MAX_ID (inclusive) are processed.
MAX_ID = 100000

# Tokens that need special handling: partial brand names are expanded to the
# full brand, and gender variants are folded into a single tag.
TAG_ALIASES = {
    'Massimo': 'Massimo Dutti',
    'WHAT': 'WHAT FOR',
    '男': '男鞋',
    '男款': '男鞋',
    '男鞋': '男鞋',
    '女': '女鞋',
    '女款': '女鞋',
    '女鞋': '女鞋',
    'Kiss': 'KissKitty',
    'URBAN': 'URBAN REVIVO',
    'Jimmy': 'Jimmy Choo',
    'Inking': 'Inking Pot',
    'Miss': 'Miss Sixty',
    'Martens': 'Dr.Martens',
}


def load_tags(path):
    """Return the set of tags stored in the tag library file at *path*.

    The tag of a line is its first space-separated field. A set is returned
    because the tags are only used for membership tests.
    """
    with open(path, 'r', encoding='utf-8') as tag_file:
        return {line.split(' ')[0] for line in tag_file.read().splitlines()}


def build_tag_string(words, known_tags):
    """Build the tag string stored in the database for one product.

    Args:
        words: Tokens produced by segmenting the product name.
        known_tags: Collection of tags from the tag library.

    Returns:
        Every token that is a known tag, mapped through TAG_ALIASES, each
        followed by '|' (so a non-empty result ends with '|'). Tokens that
        are not in *known_tags* are dropped; order and duplicates are kept.
    """
    return ''.join(
        TAG_ALIASES.get(word, word) + '|'
        for word in words
        if word in known_tags
    )


def main():
    """Segment every product name and write the matching tags back to MySQL."""
    known_tags = load_tags(TAG_FILE)

    # NOTE(review): jieba.set_dictionary replaces jieba's main dictionary; if
    # the intent is only to add custom words on top of the default one,
    # jieba.load_userdict may be what is wanted -- confirm.
    jieba.set_dictionary(JIEBA_DICT)

    coon = pymysql.connect(user='root', password='root', host='127.0.0.1', port=3306,
                           database='bishe_shoes', use_unicode=True, charset="utf8")
    try:
        cursor = coon.cursor()
        try:
            for shoes_id in range(1, MAX_ID + 1):
                print(shoes_id)
                # Read the product name; values are passed as query
                # parameters so the driver does the quoting.
                cursor.execute("select shoes_name from shoes where id = %s", (shoes_id,))
                row = cursor.fetchone()
                if row is None or row[0] is None:
                    # No such product (gap in the ids) or no name: nothing
                    # to tag, move on instead of aborting the whole run.
                    continue
                shoes_name = row[0]
                print(shoes_name)

                # Segment the product name in accurate (non-full) mode.
                words = list(jieba.cut(shoes_name, cut_all=False))
                print(words)

                # Keep only the tokens that are tags of the tag library.
                shoes_ku = build_tag_string(words, known_tags)
                print(shoes_ku)

                # Save the generated tags of the product to the database.
                update_sql = "update shoes set tag = %s where id = %s"
                params = (shoes_ku, shoes_id)
                print(cursor.mogrify(update_sql, params))
                cursor.execute(update_sql, params)
                coon.commit()
        finally:
            cursor.close()
    finally:
        coon.close()


if __name__ == '__main__':
    main()
运行结果:
以上是 python jieba分词 的全部内容, 来源链接: utcz.com/z/387369.html