In [1]:
import numpy as np
import pandas as pd
1. Dataset inspection and preprocessing¶
Using the breast cancer dataset from Kaggle, we solve a classification problem: determining whether a given sample is benign or malignant (https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data)
In [2]:
data = pd.read_csv('./data/data.csv')
1) Checking columns and missing values¶
- diagnosis : diagnosis result (B: benign, M: malignant)
In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
2) Dropping unnecessary columns¶
In [4]:
data = data.drop(columns=['Unnamed: 32'])
3) Checking the class distribution¶
In [5]:
data.diagnosis.value_counts(normalize=True)
Out[5]:
B    0.627417
M    0.372583
Name: diagnosis, dtype: float64
2. train / test split¶
In [6]:
from sklearn.model_selection import train_test_split
In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    data[[col for col in data.columns if col not in ['id', 'diagnosis']]],
    data['diagnosis'], test_size=0.33)
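The split above is unseeded and unstratified; since the classes are imbalanced (about 63% B), a stratified, seeded split keeps the B/M ratio equal in both sets and makes the run reproducible. A minimal alternative sketch; the stratify and random_state arguments are additions, not settings used in this post:

from sklearn.model_selection import train_test_split

# Stratify on the label so train/test keep the same B/M ratio;
# random_state pins the split for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['id', 'diagnosis']),
    data['diagnosis'],
    test_size=0.33,
    stratify=data['diagnosis'],
    random_state=42,
)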
3. auto-sklearn¶
- autosklearn is an open-source library for AutoML.
- Depending on the problem you want to solve, you can use either AutoSklearnClassifier or AutoSklearnRegressor.
1) Building a classification model with auto-sklearn¶
Parameters¶
- time_left_for_this_task=3600 : overall time limit (in seconds) for the whole search
- per_run_time_limit=None : time limit per individual ML model run
- max_models_on_disc=50 : maximum number of models that can be kept on disk
- memory_limit=3072 : in MB; a run is stopped when it exceeds this limit (when multiprocessing, this is the memory allowed per job)
- resampling_strategy='holdout' : resampling method used to prevent overfitting
- resampling_strategy_arguments=None : arguments for the chosen resampling strategy
- tmp_folder=None : path where configuration output and logs are stored
- delete_tmp_folder_after_terminate=True : whether to delete tmp_folder after training finishes
- n_jobs: Union[int, NoneType] = None : number of jobs when running in parallel; -1 uses all processors
- load_models: bool = True : whether to load the models after training

A usage sketch combining several of these parameters follows right after this list.
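For reference, a minimal sketch of how several of these parameters fit together. The values here are illustrative, not the settings used later in this post:

import autosklearn.classification

# Illustrative configuration: 10-minute search, 1-minute cap per model,
# 5-fold CV instead of the default holdout, logs kept on disk afterwards.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=600,
    per_run_time_limit=60,
    memory_limit=3072,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    tmp_folder='./log/',
    delete_tmp_folder_after_terminate=False,
    n_jobs=-1,
)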
In [10]:
import autosklearn.classification
Model training¶
In [9]:
%%time
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=300, tmp_folder="./log/")  # let it search for 5 minutes
automl.fit(x_train, y_train)
CPU times: user 1min 43s, sys: 1.87 s, total: 1min 45s
Wall time: 5min 6s
Out[9]:
AutoSklearnClassifier(per_run_time_limit=30, time_left_for_this_task=300, tmp_folder='./log/')
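With the default holdout resampling, the ensemble members are fit on only part of x_train. auto-sklearn provides refit() to retrain them on the full training set before final prediction; a hedged sketch, with the .copy() calls following the library's own examples:

# Retrain every ensemble member on the complete training data
# (recommended after holdout or CV resampling).
automl.refit(x_train.copy(), y_train.copy())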
2) Checking the models and costs found by auto-sklearn¶
In [11]:
automl.leaderboard()
Out[11]:
model_id | rank | ensemble_weight | type | cost | duration
---|---|---|---|---|---
81 | 1 | 0.06 | mlp | 0.007937 | 1.163223
6 | 2 | 0.04 | mlp | 0.015873 | 1.380437
70 | 3 | 0.02 | mlp | 0.015873 | 4.388834
51 | 4 | 0.02 | mlp | 0.015873 | 1.530032
3 | 5 | 0.02 | mlp | 0.023810 | 1.361230
75 | 6 | 0.04 | mlp | 0.023810 | 2.225231
67 | 7 | 0.04 | mlp | 0.023810 | 2.053632
61 | 8 | 0.02 | sgd | 0.023810 | 0.904623
54 | 9 | 0.06 | mlp | 0.023810 | 0.947851
48 | 10 | 0.04 | mlp | 0.023810 | 1.612731
84 | 11 | 0.04 | mlp | 0.031746 | 2.060007
71 | 12 | 0.04 | mlp | 0.031746 | 1.586483
28 | 13 | 0.02 | mlp | 0.031746 | 1.406765
45 | 14 | 0.02 | mlp | 0.039683 | 1.283361
10 | 15 | 0.02 | extra_trees | 0.039683 | 1.588807
41 | 16 | 0.02 | mlp | 0.047619 | 1.629064
77 | 17 | 0.06 | random_forest | 0.047619 | 1.730700
50 | 18 | 0.02 | mlp | 0.047619 | 1.497629
76 | 19 | 0.06 | extra_trees | 0.047619 | 1.584902
17 | 20 | 0.02 | mlp | 0.047619 | 4.385568
82 | 21 | 0.04 | gaussian_nb | 0.055556 | 0.742644
14 | 22 | 0.02 | mlp | 0.055556 | 1.593996
12 | 23 | 0.02 | gradient_boosting | 0.055556 | 1.298581
8 | 24 | 0.04 | random_forest | 0.063492 | 1.842520
2 | 25 | 0.02 | random_forest | 0.063492 | 1.606486
38 | 26 | 0.02 | adaboost | 0.063492 | 1.166562
40 | 27 | 0.02 | extra_trees | 0.063492 | 1.816874
9 | 28 | 0.04 | random_forest | 0.071429 | 2.322743
25 | 29 | 0.02 | adaboost | 0.071429 | 1.926036
49 | 30 | 0.06 | gradient_boosting | 0.071429 | 1.057095
13 | 31 | 0.02 | random_forest | 0.071429 | 1.894673
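leaderboard() also accepts arguments for a more detailed view; assuming a version of auto-sklearn that supports them (present in recent releases), a sketch:

# Show extra columns (e.g. the full configuration) for each model,
# sorted by validation cost.
automl.leaderboard(detailed=True, sort_by='cost')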
In [12]:
print(automl.sprint_statistics())
auto-sklearn results:
  Dataset name: 64dbb92e-f527-11ec-b864-48df37344624
  Metric: accuracy
  Best validation score: 0.992063
  Number of target algorithm runs: 84
  Number of successful target algorithm runs: 82
  Number of crashed target algorithm runs: 1
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0
Number of models used in the ensemble
In [15]:
len(automl.get_models_with_weights())
Out[15]:
31
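Each entry returned by get_models_with_weights() is a (weight, pipeline) pair, so the ensemble-weight distribution can be inspected directly; a small sketch, printing only the first five entries:

# Weights across all ensemble members sum to 1.0.
for weight, pipeline in automl.get_models_with_weights()[:5]:
    print(f'{weight:.2f}', type(pipeline).__name__)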
Checking model details (top-ranked models only)
In [45]:
models = automl.show_models()  # dict keyed by model_id
for idx, model_id in enumerate(models):
    print('model_id :', models[model_id]['model_id'])
    print('rank :', models[model_id]['rank'])
    print('cost :', models[model_id]['cost'])
    print('ensemble_weight :', models[model_id]['ensemble_weight'])
    print('sklearn_classifier :', models[model_id]['sklearn_classifier'])
    print('')
    if idx > 4:
        break
model_id : 81
rank : 1
cost : 0.007936507936507908
ensemble_weight : 0.06
sklearn_classifier : MLPClassifier(alpha=0.0005147098794375924, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(115, 115), learning_rate_init=0.00047552668954983654, max_iter=64, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)

model_id : 6
rank : 2
cost : 0.015873015873015928
ensemble_weight : 0.04
sklearn_classifier : MLPClassifier(alpha=0.0017940473175767063, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(101, 101), learning_rate_init=0.0004684917334431039, max_iter=32, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)

model_id : 51
rank : 3
cost : 0.015873015873015928
ensemble_weight : 0.02
sklearn_classifier : MLPClassifier(activation='tanh', alpha=0.0002934547032292889, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(180, 180, 180), learning_rate_init=0.00031374954092369304, max_iter=32, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)

model_id : 70
rank : 4
cost : 0.015873015873015928
ensemble_weight : 0.02
sklearn_classifier : MLPClassifier(activation='tanh', alpha=0.004195177650800112, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(187,), learning_rate_init=0.000258112591010141, max_iter=32, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)

model_id : 3
rank : 5
cost : 0.023809523809523836
ensemble_weight : 0.02
sklearn_classifier : MLPClassifier(activation='tanh', alpha=0.0001363185819149026, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(115, 115, 115), learning_rate_init=0.00018009776276177523, max_iter=32, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)

model_id : 48
rank : 6
cost : 0.023809523809523836
ensemble_weight : 0.04
sklearn_classifier : MLPClassifier(activation='tanh', alpha=0.0195616022441711, beta_1=0.999, beta_2=0.9, early_stopping=True, hidden_layer_sizes=(200, 200, 200), learning_rate_init=0.07346636174243101, max_iter=32, n_iter_no_change=32, random_state=1, verbose=0, warm_start=True)
4. Performance evaluation¶
In [46]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
In [48]:
model = automl
print(y_test.reset_index().groupby('diagnosis').size())
y_predict = model.predict(x_test)
y_predict_score = model.predict_proba(x_test)[:, 1]
diagnosis
B    118
M     70
dtype: int64
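Note that predict_proba(x_test)[:, 1] assumes column 1 is the malignant class: scikit-learn orders probability columns by sorted class labels, so with labels 'B' and 'M' column 1 is 'M'. Assuming the auto-sklearn wrapper exposes classes_ like other scikit-learn estimators, this can be verified:

# Column order of predict_proba follows model.classes_ (sorted labels),
# so index 1 should correspond to 'M' (malignant).
print(model.classes_)  # expected: ['B' 'M']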
In [52]:
print("Accuracy : {},\nAUC : {},\nPrecision : {},\nRecall : {}".format(
accuracy_score(y_test, y_predict),
roc_auc_score(y_test, y_predict_score),
precision_score(y_test, y_predict, pos_label='B'),
recall_score(y_test, y_predict, pos_label='B')
))
Accuracy : 0.9787234042553191,
AUC : 0.9966101694915254,
Precision : 0.975,
Recall : 0.9915254237288136
In [53]:
confusion_matrix(y_test, y_predict)
Out[53]:
array([[117,   1],
       [  3,  67]])
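By default confusion_matrix() orders rows and columns by sorted labels, so here both are [B, M]: 117 benign samples classified correctly, 1 benign predicted as malignant, 3 malignant predicted as benign, and 67 malignant classified correctly. Passing labels makes the ordering explicit:

# rows = true class, columns = predicted class, in the given label order
confusion_matrix(y_test, y_predict, labels=['B', 'M'])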