Python jieba 分词

python

#!/usr/bin/python

# -*- coding: UTF-8 -*-

import jieba

import jieba.analyse

import pymysql

# Tag each row of the `shoes` table: segment the product name with jieba,
# keep only the tokens that appear in the tag library, and store them in
# the `tag` column as a '|'-terminated list.

# Tag library file; the first space-separated field of each line is a tag.
# Raw string so the backslashes are never read as escape sequences.
TAG_LIBRARY_PATH = r'D:\spider\shoes.txt'

# Dictionary file handed to jieba.
JIEBA_DICT_PATH = './shoes.txt'

# Inclusive range of row ids to process.
FIRST_SHOE_ID = 1
LAST_SHOE_ID = 100000

# Parameterized statements: the driver escapes the values, so a quote
# character inside a tag or an id can never break or alter the SQL.
SELECT_NAME_SQL = "select shoes_name from shoes where id = %s"
UPDATE_TAG_SQL = "update shoes set tag = %s where id = %s"

# Tokens that jieba cuts out of a longer brand name (or that are gender
# variants) and the canonical tag that is stored in their place.
TAG_ALIASES = {
    'Massimo': 'Massimo Dutti',
    'WHAT': 'WHAT FOR',
    '男': '男鞋',
    '男款': '男鞋',
    '男鞋': '男鞋',
    '女': '女鞋',
    '女款': '女鞋',
    '女鞋': '女鞋',
    'Kiss': 'KissKitty',
    'URBAN': 'URBAN REVIVO',
    'Jimmy': 'Jimmy Choo',
    'Inking': 'Inking Pot',
    'Miss': 'Miss Sixty',
    'Martens': 'Dr.Martens',
}


def load_tag_library(path):
    """Return the set of tags listed in the tag library file at *path*.

    Each line contributes its first space-separated field. A set is
    returned so that membership tests in the tagging loop are O(1).
    """
    with open(path, 'r', encoding='utf-8') as tag_file:
        return {line.split(' ')[0] for line in tag_file.read().splitlines()}


def build_tag_string(words, tag_ku):
    """Return the '|'-terminated tag list for the segmented *words*.

    Only words contained in *tag_ku* are kept, in their original order;
    words listed in TAG_ALIASES are replaced by their canonical tag.
    Every kept tag is followed by '|', so the result is '' when nothing
    matches. Duplicates are not removed.
    """
    return ''.join(
        TAG_ALIASES.get(word, word) + '|' for word in words if word in tag_ku
    )


def main():
    """Tag every shoe whose id lies in FIRST_SHOE_ID..LAST_SHOE_ID."""
    tag_ku = load_tag_library(TAG_LIBRARY_PATH)

    # NOTE(review): set_dictionary replaces jieba's main dictionary; if the
    # intent is to extend the default one, jieba.load_userdict may be the
    # call that is wanted - confirm.
    jieba.set_dictionary(JIEBA_DICT_PATH)

    coon = pymysql.connect(user='root', password='root', host='127.0.0.1', port=3306, database='bishe_shoes', use_unicode=True, charset="utf8")
    try:
        cursor = coon.cursor()
        try:
            for shoe_id in range(FIRST_SHOE_ID, LAST_SHOE_ID + 1):
                print(shoe_id)

                # Read the product name; ids may have gaps, and a missing
                # row or NULL name must not abort the whole run.
                cursor.execute(SELECT_NAME_SQL, (shoe_id,))
                row = cursor.fetchone()
                if row is None or row[0] is None:
                    print('no shoes_name for id {}, skipped'.format(shoe_id))
                    continue
                shoes_name = row[0]
                print(shoes_name)

                # Segment the name in jieba's accurate (non-full) mode.
                result = list(jieba.cut(shoes_name, cut_all=False))
                print(result)

                shoes_ku = build_tag_string(result, tag_ku)
                print(shoes_ku)

                # Store the tags; commit per row so progress survives a
                # failure later in the run.
                params = (shoes_ku, shoe_id)
                print(cursor.mogrify(UPDATE_TAG_SQL, params))
                cursor.execute(UPDATE_TAG_SQL, params)
                coon.commit()
        finally:
            cursor.close()
    finally:
        coon.close()


if __name__ == '__main__':
    main()

运行结果:

以上是 Python jieba 分词的全部内容，来源链接：utcz.com/z/387369.html

回到顶部