决策树缺失值处理

coding

缺失值算是决策树里处理起来比较麻烦的了,其他简单的我就不发布了。

# encoding:utf-8

from__future__import division

__author__ = 'HP'

import copy

import math

import numpy as np

import pandas as pd

from collections import Counter

from sklearn.preprocessing import LabelEncoder

################################

# id3

# 离散属性

# 多分类

# 多重字典记录学习规则

# 非递归

# 深度优先

# 预剪枝

### 缺失值处理

# 解决两个问题

# 如何进行划分属性选择,缺失值如何处理

# 如何进行样本划分,缺失值对应的样本如何划分

################################

''' 缺失值处理

1. 如何进行属性选择

a. 第一次选择划分属性时,样本等权重,均为1,找出未缺失的样本集,计算该样本集的信息增益 和 该样本集的占比,两者相乘即为真正的信息增益

. 注意这时计算占比,就是数个数,因为权重都是1

. 计算信息增益时,P也是数个数

b. 后面选择划分属性时,样本不等权重,找出未缺失的样本集,计算该样本集的信息增益 和 该样本集的占比,两者相乘即为真正的信息增益

. 此时样本权重不全为1

. 计算占比时不是数个数,而是求权重和

. 计算信息增益的P时,也是求权重和

2. 如何划分节点

a. 未缺失按照正常方法划分,权重都为1

b. 属性值缺失的样本同时划入所有子集,其权重不再为1,而是该子集对应的属性值在未缺失样本集中所占的比例

'''

def mydata():
    """Load ``xg3.txt`` and return it as a label-encoded numpy array.

    The file is read as GBK-encoded CSV with its first column as the index.
    The remaining columns are renamed to ``0..8`` (so the file must have
    exactly nine of them), surrounding whitespace is stripped from the last
    column, and columns 0-5 and 8 are integer-encoded with ``LabelEncoder``.
    Columns 6 and 7 are left untouched.

    Returns:
        numpy.ndarray: ``DataFrame.values`` of the encoded frame.
    """
    data = pd.read_csv('xg3.txt', index_col=[0], encoding='gbk')
    data.columns = range(9)

    # Address the last column by its explicit label instead of relying on
    # the positional fallback of ``[-1]`` on a label-indexed frame.
    data[8] = data[8].str.strip()

    str_cols = [0, 1, 2, 3, 4, 5, 8]
    for i in str_cols:
        # LabelEncoder expects a 1-D sequence, so pass the column as a
        # Series rather than a one-column DataFrame.
        data[i] = LabelEncoder().fit_transform(data[i])

    return data.values

def get_label(labels):
    """Return the most frequent label in *labels*.

    Ties are resolved in favour of the label that is encountered first.

    Args:
        labels: iterable of hashable labels.

    Returns:
        The majority label, or ``None`` when *labels* is empty.
    """
    most_common = Counter(labels).most_common(1)
    if not most_common:
        return None
    return most_common[0][0]

def entropy(attr):
    """Return the Shannon entropy (base 2) of the values in *attr*.

    Every element counts equally; the probability of a value is its
    frequency divided by ``len(attr)``.

    Args:
        attr: sized iterable of hashable values (list or 1-D array).

    Returns:
        float: the entropy in bits; ``0.0`` for an empty input.
    """
    total = len(attr)
    if total == 0:
        return 0.0

    result = 0.0
    for count in Counter(attr).values():
        # Explicit float() so the result does not depend on
        # ``from __future__ import division`` being in effect.
        p = count / float(total)
        result -= p * math.log(p, 2)
    return result

def gain_queshi_equal_weight(attr, label):
    """Information gain of an attribute with missing values, equal weights.

    Intended for the first split, where every sample has weight 1. The gain
    is computed on the samples whose attribute value is not NaN and then
    scaled by the fraction of such samples.

    Args:
        attr: 1-D float numpy array of attribute values; NaN marks a
            missing value.
        label: 1-D numpy array of labels, aligned with *attr*.

    Returns:
        float: ``rho * (H(labels) - sum_v r_v * H(labels | attr == v))``
        evaluated on the non-missing samples, where ``rho`` is the
        non-missing fraction. ``0.0`` when *attr* is empty or every value
        is missing.
    """
    total = attr.shape[0]
    if total == 0:
        return 0.0

    # Detect missing values with isnan rather than ``attr >= 0`` so that
    # legitimate negative attribute values are not treated as missing.
    known = ~np.isnan(attr)
    attr_new = attr[known]
    label_new = label[known]

    count_nonan = attr_new.shape[0]
    if count_nonan == 0:
        return 0.0

    # Fraction of samples whose attribute value is present.
    zhanbi = count_nonan / float(total)

    # Entropy of the non-missing samples before the split.
    ori_entropy = entropy(label_new)

    # Entropy after the split: each branch weighted by its share of the
    # non-missing samples.
    new_entropy = 0.0
    values, counts = np.unique(attr_new, return_counts=True)
    for key, count in zip(values, counts):
        branch_labels = label_new[attr_new == key]
        new_entropy += count / float(count_nonan) * entropy(branch_labels)

    return zhanbi * (ori_entropy - new_entropy)

def split_node_queshi(node, attr_split, weight=None):
    """Split a sample set on an attribute that may contain missing values.

    Samples whose value in column *attr_split* is present go to the child
    for that value and keep their weight. Samples whose value is NaN are
    appended to every child, with their weight multiplied by that child's
    share of the non-missing weight, so the weights of one missing sample
    sum to its original weight across all children.

    Args:
        node: 2-D float numpy array, one sample per row; NaN marks a
            missing value.
        attr_split: index of the column to split on.
        weight: optional 1-D sequence of per-sample weights aligned with
            the rows of *node*. Defaults to a weight of 1 for every sample.

    Returns:
        list: one ``[key, samples, weights]`` entry per distinct
        non-missing value, in ascending order of ``key``. ``samples`` holds
        the rows with that value followed by all rows with a missing value;
        ``weights`` is the matching 1-D float array. The list is empty when
        no sample has a value for the attribute.
    """
    node = np.asarray(node)
    column = node[:, attr_split]
    if weight is None:
        weight = np.ones(node.shape[0])
    else:
        weight = np.asarray(weight, dtype=float)

    # Detect missing values with isnan rather than ``>= 0`` so that negative
    # attribute values are not silently dropped from every child.
    missing = np.isnan(column)
    known = ~missing

    sample_queshi = node[missing]
    weight_queshi = weight[missing]
    known_weight_sum = weight[known].sum()

    split = []
    for key in np.unique(column[known]):
        in_child = known & (column == key)
        # Share of the non-missing weight that falls into this child. It is
        # kept unrounded so the shares add up to exactly one.
        zhanbi_key = weight[in_child].sum() / known_weight_sum
        node_child = np.vstack((node[in_child], sample_queshi))
        weight_child = np.concatenate(
            (weight[in_child], weight_queshi * zhanbi_key))
        split.append([key, node_child, weight_child])
    return split

def entropy_no_equal_weight(attr, weight):
    """Return the base-2 entropy of *attr* when samples carry weights.

    The probability of a value is the sum of the weights of the samples
    holding it divided by the sum of all weights.

    Args:
        attr: 1-D array-like of values.
        weight: 1-D array-like of sample weights aligned with *attr*.

    Returns:
        float: the weighted entropy in bits; ``0.0`` when the total weight
        is not positive (which includes empty input).
    """
    attr = np.asarray(attr)
    weight = np.asarray(weight, dtype=float)

    sum_weight = weight.sum()
    if sum_weight <= 0:
        return 0.0

    result = 0.0
    for key in np.unique(attr):
        zhanbi = weight[attr == key].sum() / sum_weight
        # A value whose samples all have zero weight contributes nothing;
        # skipping it also avoids math.log(0).
        if zhanbi > 0:
            result -= zhanbi * math.log(zhanbi, 2)
    return result

def gain_queshi_no_equal_weight(attr, weight, label):
    """Information gain of an attribute with missing values, weighted.

    Intended for splits after the first one, where samples may carry
    fractional weights. Every proportion is a ratio of weight sums rather
    than of sample counts.

    Args:
        attr: 1-D float numpy array of attribute values; NaN marks a
            missing value.
        weight: 1-D numpy array of sample weights aligned with *attr*.
        label: 1-D numpy array of labels aligned with *attr*.

    Returns:
        float: ``rho * (H(labels) - sum_v r_v * H(labels | attr == v))``
        evaluated on the non-missing samples, where ``rho`` is the
        non-missing share of the total weight and ``r_v`` is the share of
        the non-missing weight held by value ``v``. ``0.0`` when the total
        weight or the non-missing weight is zero.
    """
    total_weight = np.sum(weight)
    if total_weight == 0:
        return 0.0

    # Detect missing values with isnan rather than ``attr >= 0`` so that
    # legitimate negative attribute values are not treated as missing.
    known = ~np.isnan(attr)
    attr_new = attr[known]
    label_new = label[known]
    weight_new = weight[known]

    known_weight = float(np.sum(weight_new))
    if known_weight == 0:
        return 0.0

    # Share of the total weight carried by samples with a known value.
    zhanbi = known_weight / float(total_weight)

    # Weighted entropy of the non-missing samples before the split.
    ori_entropy = entropy_no_equal_weight(label_new, weight_new)

    # Weighted entropy after the split.
    new_entropy = 0.0
    for key in np.unique(attr_new):
        in_branch = attr_new == key
        weight_key = weight_new[in_branch]
        branch_weight = float(np.sum(weight_key))
        if branch_weight == 0:
            # A branch without weight contributes nothing to the sum.
            continue
        # The branch share is a ratio of weight sums, not of sample counts;
        # counting samples would over-weight fractional (missing) samples.
        new_entropy += (branch_weight / known_weight
                        * entropy_no_equal_weight(label_new[in_branch],
                                                  weight_key))

    return zhanbi * (ori_entropy - new_entropy)

if __name__ == '__main__':
    data = mydata()

    # Keep columns 0-5 and 8, i.e. the columns mydata() label-encodes;
    # the last remaining column is used as the label below.
    data = data[:, [0, 1, 2, 3, 4, 5, 8]]

    # Blank out a few cells of columns 0 and 3 to simulate missing values.
    for row, col in [(0, 0), (4, 0), (12, 0), (7, 3), (9, 3)]:
        data[row, col] = np.nan
    print(data)

    # Information gain with equal sample weights, one value per column.
    # The last column is the label itself, so its line is simply the
    # entropy of the labels.
    for i in range(data.shape[1]):
        print(gain_queshi_equal_weight(data[:, i], data[:, -1]))

    # Split the samples on column 3, which now contains missing values.
    split = split_node_queshi(data, 3)
    print(split)

    # Information gain with unequal sample weights.
    # Alternative example with hand-written weights:
    # weight = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3, 1/3])
    # gain_queshi_no_equal_weight(data[:,0], weight, data[:,-1])

    # Evaluate column 0 inside the third child of the split above, using
    # the weights that split produced.
    gain = gain_queshi_no_equal_weight(
        split[2][1][:, 0], split[2][2], split[2][1][:, -1])
    print(gain)

以上是 决策树缺失值处理 的全部内容, 来源链接: utcz.com/z/508813.html

回到顶部