GridSearchCV-sklearn

决策树调参

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

import numpy as np
import matplotlib.pyplot as plt

## Load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:, :2]  # keep only the first 2 features
y = iris.target

## Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## Fit a decision tree and compute its accuracy on the test set
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and fitting can be chained.
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtc.predict(X_test)
accuracy_score(y_test, y_pred)

## 画出决策树示意图
from sklearn.tree import export_graphviz
from io import StringIO
from IPython.display import Image
import pydot

def create_png(clf):
    """Render a fitted decision tree as PNG image data.

    Args:
        clf: Fitted tree estimator to export via ``export_graphviz``.

    Returns:
        The PNG data produced by pydot for the first graph parsed from
        the exported DOT text.
    """
    dot_iris = StringIO()
    # The estimator is the first positional argument and the keyword is
    # ``feature_names`` (plural). The model was trained on the first two
    # iris features only, so only those two names are passed.
    export_graphviz(
        clf,
        out_file=dot_iris,
        feature_names=iris.feature_names[:2],
        filled=True,
    )
    graphs = pydot.graph_from_dot_data(dot_iris.getvalue())
    return graphs[0].create_png()

# Display the baseline tree's PNG as an IPython image (notebook output).
Image(create_png(dtc))

## Search for the best hyperparameters with GridSearchCV
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier()
# Candidate values for each hyperparameter; every combination is evaluated.
grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 9, 20],
)
gs = GridSearchCV(dtc, param_grid=grid, cv=5)  # 5-fold cross-validation
gs.fit(X_train, y_train)

## Accuracy on the test set of the model found by the grid search
accuracy_score(y_test, gs.predict(X_test))

## Inspect the full search results
gs.cv_results_

# The estimator selected by the search
gs.best_estimator_

## Draw the decision tree found by the grid search
Image(create_png(gs.best_estimator_))


## Explore how the maximum depth affects decision-tree performance
# param_grid must be a dict mapping parameter names to candidate values;
# the same range is reused as the x-axis below so the two cannot drift apart.
depths = range(3, 50)
grid = {'max_depth': depths}
gs2 = GridSearchCV(dtc, param_grid=grid, cv=5)
gs2.fit(X_train, y_train)

# Mean cross-validated score for each candidate depth
gs2.cv_results_['mean_test_score']

plt.plot(depths, gs2.cv_results_['mean_test_score'])

随机森林-【机器学习】【sklearn】网格搜索GridSearchCV

1. 加载数据

from sklearn.datasets import load_wine

# Load the wine dataset: feature matrix X and class labels y.
wine = load_wine()
X = wine.data
y = wine.target

from sklearn.ensemble import RandomForestClassifier

# Base estimator whose hyperparameters will be tuned by the grid search.
rfc = RandomForestClassifier()

2. 网格搜索找出最优参数

# Dictionary of candidate values for each hyperparameter.
# Keys must match RandomForestClassifier parameter names exactly
# ("n_estimators", plural), otherwise the search fails with an
# invalid-parameter error.
param_grid = {
    "n_estimators": np.arange(10, 201, 10),
    "max_features": np.arange(0.1, 1, 0.1),
    "max_depth": np.arange(3, 13),
    "bootstrap": [True, False],
}

from sklearn.model_selection import GridSearchCV

# Build the grid search; k-fold cross-validation (k=5) is built in.
myGrid = GridSearchCV(rfc, param_grid=param_grid, cv=5)

myGrid.fit(X, y)  # run the search

print(
    myGrid.best_params_,     # best parameter combination
    myGrid.best_score_,
    myGrid.best_estimator_,  # best model
    myGrid.best_index_,
)

输出最优参数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from sklearn import metrics
# NOTE(review): grid_search, parameters and pipeline are not defined in the
# visible part of this file; this snippet presumably comes from a separate
# text-classification pipeline example. Confirm before running.
best_parameters = grid_search.best_estimator_.get_params()
# Print the tuned value of every searched parameter, in name order.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
# Apply the chosen parameter values to the pipeline, refit and predict.
pipeline.set_params(clf__alpha=1e-05,
                    tfidf__use_idf=True,
                    vect__max_df=0.5,
                    vect__max_features=None)
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

3. 使用最优模型做分类

….