执行《阿里云天池大赛赛题解析——机器学习篇》第二章例程不成功?

执行《阿里云天池大赛赛题解析——机器学习篇》第二章例程不成功?

执行《阿里云天池大赛赛题解析——机器学习篇》第二章2.2.4章节第3小节例程:

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from scipy import stats

###import warnings

###warnings.filterwarnings("ignore")

import logging

# Configure the root logger: DEBUG and above are emitted, and each record
# carries a timestamp, the source path and line number, the level name and
# the message text.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)

# Emit one record at each standard severity level.
logging.debug('debug info')
logging.info('info info')
logging.warning('warning info')
logging.error('error info')
logging.critical('critial info')

##%matplotlib inline

# Paths of the tab-separated training and test files, relative to the
# working directory.
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

# DataFrame.info() prints its summary itself.
train_data.info()
test_data.info()

# describe() and head() only return a DataFrame; when this file is run as a
# script the result has to be printed explicitly or nothing is shown.
# The training set is described first (previously test_data was described
# twice and train_data never).
print(train_data.describe())
print(test_data.describe())

print(train_data.head())
print(test_data.head())

# Box plot of the single feature V0.
fig = plt.figure(figsize=(4, 6))  # figure width and height
# The values are passed as y= so the box is drawn vertically no matter how
# the installed seaborn version interprets a first positional argument.
sns.boxplot(y=train_data['V0'], width=0.5)

# Box plots of the first 38 columns, laid out on a 13-row x 3-column grid.
column = train_data.columns.tolist()[:39]  # column headers
fig = plt.figure(figsize=(20, 40))  # figure width and height
for i, name in enumerate(column[:38]):
    plt.subplot(13, 3, i + 1)  # subplot slots are numbered from 1
    sns.boxplot(y=train_data[name], width=0.5)  # box plot
    plt.ylabel(name, fontsize=8)

# Displays every figure created so far (the V0 figure and the grid).
plt.show()

def find_outliers(model, X, y, sigma=3):
    """Flag samples whose regression residual is unusually large.

    Predictions of ``model`` on ``X`` are compared with ``y``. The residuals
    are standardised to z-scores, and every sample with ``|z| > sigma`` is
    reported as an outlier. R2, MSE and the outlier count are printed, and a
    three-panel diagnostic figure (prediction vs. truth, residual vs. truth,
    z-score histogram) is saved as ``outliers.jpg`` in the working directory.

    Args:
        model: Estimator providing ``predict``, ``fit`` and ``score``. If the
            first ``predict`` call raises, the model is fitted on ``(X, y)``
            and the prediction is retried.
        X: Feature table; only ``X.shape`` is read here besides passing it
            to the model.
        y: pandas Series of target values; its index is reused for the
            predictions.
        sigma: Threshold on the absolute z-score above which a sample is
            treated as an outlier.

    Returns:
        The index labels of the samples flagged as outliers.
    """
    # Predict y values using the model; if predicting fails, fit the model
    # first and retry. ``Exception`` (rather than a bare ``except``) keeps
    # that fallback but lets KeyboardInterrupt / SystemExit propagate.
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # Residuals between the true y values and the model prediction.
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # z statistic; outliers are the samples where |z| > sigma.
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # Print the summary.
    print('R2=', model.score(X, y))
    print('Mse=', mean_squared_error(y, y_pred))
    print('-------------------------------------------------------')
    print(len(outliers), 'outliers;', ' ALL data shape:', X.shape)

    plt.figure(figsize=(15, 5))

    # Panel 1: prediction against truth, outliers over-plotted in red.
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    # Panel 2: residual against truth, outliers over-plotted in red.
    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.plot(y.loc[outliers], resid.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    # Panel 3: histogram of z-scores, outliers over-plotted in red.
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    plt.savefig('outliers.jpg')

    return outliers

# Use a ridge regression model to find outliers and plot their distribution.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Every column except the last is used as a feature; the last column is the
# target passed as y.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

outliers = find_outliers(model=Ridge(), X=X_train, y=y_train)

# Two side-by-side panels for feature V0: its distribution with a fitted
# normal curve on the left, a probability (Q-Q) plot on the right.
plt.figure(figsize=(10, 5))

ax = plt.subplot(1, 2, 1)
sns.distplot(train_data['V0'], fit=stats.norm)
###plt.show()

ax = plt.subplot(1, 2, 2)
res = stats.probplot(train_data['V0'], plot=plt)

plt.show()

# A variant of the per-column plotting loop, disabled by wrapping it in a
# bare triple-quoted string: the literal is evaluated and discarded, so none
# of the statements inside it run. This variant sizes the figure as
# (number of columns, 4) and requests a grid with one row per column and
# four plots per row.
'''

train_cols = len(train_data.columns)

plt.figure(figsize=( train_cols, 4 ))

i = 0

for col in train_data.columns:

i += 1

ax = plt.subplot( train_cols , 4, i)

sns.distplot(train_data[col], fit=stats.norm)

i += 1

ax = plt.subplot( train_cols , 4 , i)

res = stats.probplot(train_data[col], plot=plt)

plt.tight_layout()

plt.show()

'''

# For every column draw two adjacent panels: the distribution with a fitted
# normal curve, then a probability (Q-Q) plot against the normal distribution.
train_cols = 6  # panels per grid row, i.e. three data columns per row

# Each data column needs two panels, so the grid needs
# ceil(2 * n_columns / train_cols) rows. Using len(train_data.columns) as the
# row count reserved one row per column, which made the figure about three
# times taller than the plotted content and left most of it empty.
train_rows = (2 * len(train_data.columns) + train_cols - 1) // train_cols

# 4 x 4 inches per panel.
plt.figure(figsize=(4 * train_cols, 4 * train_rows))

i = 0  # running 1-based subplot slot
for col in train_data.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(train_data[col], fit=stats.norm)

    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(train_data[col], plot=plt)

# Adjust subplot spacing so titles and axis labels of neighbouring panels do
# not overlap.
plt.tight_layout()
plt.show()

执行后不成功:原书第 23 页上部的图并未显示出来,请问该怎么解决?

以上是 执行《阿里云天池大赛赛题解析——机器学习篇》第二章例程不成功? 的全部内容, 来源链接: utcz.com/p/938030.html

回到顶部