执行《阿里云天池大赛赛题解析——机器学习篇》第二章例程不成功?
执行《阿里云天池大赛赛题解析——机器学习篇》第二章2.2.4章节第3小节例程:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
###import warnings
###warnings.filterwarnings("ignore")
import logging
# Root-logger configuration: timestamp, source path and line number, level
# name, then the message; DEBUG level lets every record below through.
_LOG_FORMAT = '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)

# One sample record per severity, lowest to highest, to confirm the setup.
logging.debug('debug info')
logging.info('info info')
logging.warning('warning info')
logging.error('error info')
logging.critical('critial info')
##%matplotlib inline
# Tab-separated, UTF-8 encoded training and test sets, looked up relative to
# the current working directory.
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

# DataFrame.info() writes its report to stdout by itself.
train_data.info()
test_data.info()

# describe() and head() only *return* a DataFrame. Outside an interactive
# notebook cell the result is silently dropped, so print it explicitly.
# The training set is summarized first (the pasted script summarized the
# test set twice and never the training set).
print(train_data.describe())
print(test_data.describe())
print(train_data.head())
print(test_data.head())
# Stand-alone vertical box plot of column V0 on a 4 x 6 inch canvas.
fig = plt.figure(figsize=(4, 6))
sns.boxplot(train_data['V0'], orient="v", width=0.5)

# Box plots for the first 38 columns, one per cell of a 13-row by 3-column
# grid on a 20 x 40 inch canvas; each cell's y axis is labelled with the
# column name.
column = train_data.columns.tolist()[:39]
fig = plt.figure(figsize=(20, 40))
for slot in range(1, 39):
    plt.subplot(13, 3, slot)
    name = column[slot - 1]
    sns.boxplot(train_data[name], orient="v", width=0.5)
    plt.ylabel(name, fontsize=8)
plt.show()
def find_outliers(model, X, y, sigma=3):
    """Flag samples whose standardized residual exceeds ``sigma``.

    The model's predictions for ``X`` are compared with ``y``; the residuals
    are standardized with their own mean and standard deviation, and every
    sample with ``|z| > sigma`` is reported as an outlier.

    Side effects: prints the model score, the mean squared error and the
    outlier count, draws a three-panel diagnostic figure and saves it as
    ``outliers.jpg`` in the current working directory. ``plt`` and
    ``mean_squared_error`` are looked up at module scope when the function
    is called.

    Args:
        model: Estimator exposing ``predict``, ``fit`` and ``score``.
        X: Feature table; only ``X.shape`` is read here besides being handed
            to the model.
        y: Target ``pandas.Series``; its index labels the returned outliers.
        sigma: Threshold on the absolute standardized residual.

    Returns:
        The index labels (taken from ``y.index``) of the flagged samples.
    """
    # Use the model as given; if predicting fails (for example because it
    # has not been fitted yet), fit it on the supplied data and retry.
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit are not swallowed by the fallback.
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # Residuals between the true values and the model prediction.
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # Standardize the residuals; outliers are the samples with |z| > sigma.
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # Text summary.
    print('R2=', model.score(X, y))
    print('Mse=', mean_squared_error(y, y_pred))
    print('-------------------------------------------------------')
    print(len(outliers), 'outliers;', ' ALL data shape:', X.shape)

    plt.figure(figsize=(15, 5))

    # Panel 1: prediction against truth, outliers overdrawn in red.
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    # Panel 2: residual against truth, outliers overdrawn in red.
    plt.subplot(1, 3, 2)
    plt.plot(y, resid, '.')
    plt.plot(y.loc[outliers], resid.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    # Panel 3: histogram of all z values with the outliers' z values
    # overlaid in red.
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    plt.savefig('outliers.jpg')
    return outliers
# 通过岭回归模型找出异常值,并绘制其分布
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# Every column except the last is used as a feature; the last column is the
# value the model is asked to predict.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

# Run the residual-based outlier search with a default-configured Ridge model.
outliers = find_outliers(model=Ridge(), X=X_train, y=y_train)
# Two side-by-side panels for column V0 on a 10 x 5 inch canvas.
plt.figure(figsize=(10, 5))

# Left panel: distribution plot of V0 with a normal curve fitted to it.
ax = plt.subplot(121)
sns.distplot(train_data['V0'], fit=stats.norm)

# Right panel: probability plot of V0, drawn through pyplot.
ax = plt.subplot(122)
res = stats.probplot(train_data['V0'], plot=plt)

# A single show() displays both panels together.
plt.show()
# Disabled earlier attempt at the per-column distribution / probability-plot
# grid. It sits inside a bare triple-quoted string, so Python evaluates the
# string and discards it: none of the statements below are executed.
'''
train_cols = len(train_data.columns)
plt.figure(figsize=( train_cols, 4 ))
i = 0
for col in train_data.columns:
i += 1
ax = plt.subplot( train_cols , 4, i)
sns.distplot(train_data[col], fit=stats.norm)
i += 1
ax = plt.subplot( train_cols , 4 , i)
res = stats.probplot(train_data[col], plot=plt)
plt.tight_layout()
plt.show()
'''
# For every column draw a distribution plot with a fitted normal curve and,
# right next to it, a probability plot. Panels are placed left to right,
# top to bottom, on a grid that is `train_cols` panels wide.
train_cols = 6

# Each column occupies two panels, so the grid needs
# ceil(2 * n_columns / train_cols) rows. Allocating one row per column
# (len(train_data.columns)) fills only the top third of the grid: the canvas
# becomes three times taller than its content and may be too large for a
# window backend to display.
# NOTE(review): a notebook's inline backend may crop blank margins and hide
# this problem; a windowed backend does not - confirm for the backend in use.
n_panels = 2 * len(train_data.columns)
train_rows = max(1, (n_panels + train_cols - 1) // train_cols)

# 4 x 4 inches per panel.
plt.figure(figsize=(4 * train_cols, 4 * train_rows))

i = 0
for col in train_data.columns:
    # Left panel of the pair: distribution with a normal fit.
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(train_data[col], fit=stats.norm)

    # Right panel of the pair: probability plot drawn through pyplot.
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(train_data[col], plot=plt)

# Keep titles and tick labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()
运行后不成功:原书第 23 页上部的图并未显示出来,请问该怎么解决?
以上是 执行《阿里云天池大赛赛题解析——机器学习篇》第二章例程不成功? 的全部内容, 来源链接: utcz.com/p/938030.html