Machine Learning

[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates
from matplotlib.ticker import StrMethodFormatter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer

import warnings
warnings.filterwarnings('ignore')
[2]:
# Sample dataset: Titanic
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_gender = pd.read_csv('data/gender_submission.csv')

Data Visualization

[3]:
df_train.head()
[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
[4]:
df_gender.head()
[4]:
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
[5]:
# Check the columns
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64
 7   Parch        891 non-null    int64
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
[6]:
# Check missing values
sns.heatmap(df_train.isnull(), cbar=False);
../_images/src_01_01_7_0.png
[7]:
# Check survival rates by sex
women = df_train.loc[df_train['Sex'] == 'female']['Survived']
rate_women = sum(women)/len(women)
men = df_train.loc[df_train['Sex'] == 'male']['Survived']
rate_men = sum(men)/len(men)

print('% of women who survived:', rate_women)
print('% of men who survived:', rate_men)
% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924
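
The same rates fall out of a single groupby; a minimal equivalent:

[ ]:
# equivalent: mean of the 0/1 Survived flag per Sex
df_train.groupby('Sex')['Survived'].mean()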
[8]:
data = [df_train, df_test]
for dataset in data:
    # use statistics from df_train for both frames to avoid leakage
    mean = df_train['Age'].mean()
    std = df_train['Age'].std()
    is_null = dataset['Age'].isnull().sum()
    # draw one random integer within one std of the mean per missing value
    rand_age = np.random.randint(mean - std, mean + std, size=is_null)
    # fill the NaN values in the Age column with the generated values
    age_slice = dataset['Age'].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset['Age'] = age_slice.astype(int)
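
Random imputation keeps the rough shape of the age distribution, but it is not reproducible without a fixed seed. A deterministic alternative, shown only as a sketch and not used in the rest of this notebook, is median imputation based on df_train:

[ ]:
# alternative sketch (not used below): fill missing ages with the train median
age_median = df_train['Age'].median()
for dataset in [df_train, df_test]:
    dataset['Age'] = dataset['Age'].fillna(age_median).astype(int)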
[9]:
survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(16, 6))
women = df_train[df_train['Sex']=='female']
men = df_train[df_train['Sex']=='male']
ax = sns.histplot(women[women['Survived']==1].Age.dropna(), bins=18, label=survived, ax=axes[0], color='green')
ax = sns.histplot(women[women['Survived']==0].Age.dropna(), bins=40, label=not_survived, ax=axes[0], color='red')
ax.legend()
ax.set_title('Female')
ax = sns.histplot(men[men['Survived']==1].Age.dropna(), bins=18, label=survived, ax=axes[1], color='green')
ax = sns.histplot(men[men['Survived']==0].Age.dropna(), bins=40, label=not_survived, ax=axes[1], color='red')
ax.legend()
_ = ax.set_title('Male');
../_images/src_01_01_10_0.png
[10]:
fig = px.scatter_3d(df_train, x='Name', y='Sex', z='Age', color='Age')
fig.show()

(interactive plotly figure "plot"; not rendered in static output)
[11]:
for template in ['plotly']:
    fig = px.scatter(df_train,
                     x='PassengerId', y='Age', color='Survived',
                     log_x=True, size_max=20,
                     template=template, title='Which Age Survived?')
    fig.show()

(interactive plotly figure "plot_01"; not rendered in static output)
[12]:
sns.barplot(x='Pclass', y='Survived', data=df_train);
../_images/src_01_01_15_0.png
[13]:
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)

plt.figure()
fig = df_train.groupby('Survived')['Pclass'].plot.hist(histtype='bar', alpha=0.8)
plt.legend(('Died','Survived'), fontsize = 12)
plt.xlabel('Pclass', fontsize = 18)
plt.show()
../_images/src_01_01_16_0.png
[14]:
embarked_mode = df_train['Embarked'].mode()[0]  # mode() returns a Series; take the scalar
data = [df_train, df_test]
for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode)
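
A quick sanity check that the fill worked on both frames:

[ ]:
# displays (0, 0) if no missing values remain
df_train['Embarked'].isnull().sum(), df_test['Embarked'].isnull().sum()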
[15]:
grid = sns.FacetGrid(df_train, row='Embarked', height=4.5, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', order=None, hue_order=None)
grid.add_legend();
[ ]:
sns.countplot( x='Survived', data=df_train, hue='Embarked', palette='Set1');
../_images/src_01_01_19_0.png
[ ]:
data = [df_train, df_test]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'travelled_alone'] = 'No'
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 'Yes'
axes = sns.catplot(x='relatives', y='Survived',
                   data=df_train, kind='point', aspect=2.5);
../_images/src_01_01_20_0.png
[ ]:
df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked relatives travelled_alone
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S 1 No
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C 1 No
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S 0 Yes
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S 1 No
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S 0 Yes
[ ]:
fig = px.histogram(df_train, x='SibSp', y='Parch', color='Sex', marginal='rug', hover_data=df_train.columns)
fig.show()

(interactive plotly figure "plot_02"; not rendered in static output)
[ ]:
for template in ['plotly_dark']:
    fig = px.scatter(df_gender,
                     x='PassengerId', y='Survived', color='Survived',
                     log_x=True, size_max=20,
                     template=template, title="Survived or Died?")
    fig.show()

(interactive plotly figure "plot_03"; not rendered in static output)

Preprocessing

Continuous variables (Numeric)

[ ]:
# select only the numeric columns
def create_numeric(input_df):
    use_cols = [
        'Pclass',
        'Age',
        'SibSp',
        'Parch'
    ]

    return input_df[use_cols].copy()
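
A quick look at what the helper returns:

[ ]:
# sanity check: the numeric feature block for the training data
create_numeric(df_train).head()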

Categorical columns

CountEncoding

[1, 1, 1, 2] -> [3, 3, 3, 1]
[ ]:
def create_count_encoding(input_df):

    use_columns = [
        'Sex',
        'Embarked'
    ]

    out_df = pd.DataFrame()
    for column in use_columns:
        # use df_train as the reference for the counts
        vc = df_train[column].value_counts()
        out_df[column] = input_df[column].map(vc)

    return out_df.add_prefix('CE_')
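
Applied to the training frame, every category value is replaced by its frequency in df_train; for example:

[ ]:
# e.g. every 'male' row becomes the count of 'male' in df_train
create_count_encoding(df_train).head()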

OneHotEncoding

['a', 'b', 'a'] ->

[
    [1, 0],
    [0, 1],
    [1, 0]
]
[ ]:
def create_one_hot_encoding(input_df):

    use_columns = [
        'Sex',
        'Embarked'
    ]

    out_df = pd.DataFrame()
    for column in use_columns:

        vc = df_train[column].value_counts()
        # vc = vc[vc > 20]  # use when a column has many categories
        cat = pd.Categorical(input_df[column], categories=vc.index)
        # one-hot encode at this point
        out_i = pd.get_dummies(cat)
        # the columns come back as a CategoricalIndex, so convert them to a plain list (otherwise concat raises an error)
        out_i.columns = out_i.columns.tolist()
        out_i = out_i.add_prefix(f'{column}=')
        out_df = pd.concat([out_df, out_i], axis=1)
    return out_df
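
As with the count encoder, the output can be checked directly:

[ ]:
# one indicator column per category observed in df_train
create_one_hot_encoding(df_train).head()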

Wrapping the preprocessing (feature engineering) into a function

[ ]:
from tqdm import tqdm

def to_feature(input_df):

    processors = [
        create_numeric,
        create_count_encoding,
        create_one_hot_encoding
    ]

    out_df = pd.DataFrame()

    for func in tqdm(processors, total=len(processors)):
        _df = func(input_df)

        # check that the row counts match (a mismatch means the implementation of func is wrong)
        assert len(_df) == len(input_df), func.__name__
        out_df = pd.concat([out_df, _df], axis=1)

    return out_df
[ ]:
df_train_feature = to_feature(df_train)
df_test_feature = to_feature(df_test)
100%|██████████| 3/3 [00:00<00:00, 21.97it/s]
100%|██████████| 3/3 [00:00<00:00, 48.32it/s]

Model : Classification

[ ]:
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score

def fit_lgbm(X,
             y,
             cv,
             params: dict=None,
             verbose: int=500):

    if params is None:
        params = {}

    models = []
    n_records = len(X)
    n_labels = len(np.unique(y))
    oof_pred = np.zeros((n_records, n_labels), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMClassifier(**params)

        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                callbacks=[lgbm.early_stopping(stopping_rounds=100),
                           lgbm.log_evaluation(period=verbose)])

        pred_i = clf.predict_proba(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        # compute AUC on the predicted probability of the positive class, not on hard labels
        score = roc_auc_score(y_valid, pred_i[:, 1])
        print(f'Fold {i} ROC-AUC: {score:.4f}')

    score = roc_auc_score(y, oof_pred[:, 1])
    print('-' * 50)
    print(f'FINISHED | Whole ROC-AUC: {score:.4f}')
    return oof_pred, models
[ ]:
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'learning_rate': .1,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 10000,
    'colsample_bytree': .5,
    'min_child_samples': 10,
    'subsample_freq': 3,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
}
[ ]:
from sklearn.model_selection import KFold

fold = KFold(n_splits=5, shuffle=True, random_state=71)
y = df_train['Survived']
cv = list(fold.split(df_train_feature, y))

oof, models = fit_lgbm(df_train_feature.values, y, cv, params=params, verbose=500)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]    valid_0's binary_logloss: 0.374653
Fold 0 ROC-AUC: 0.7890
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]    valid_0's binary_logloss: 0.453736
Fold 1 ROC-AUC: 0.7874
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[68]    valid_0's binary_logloss: 0.422482
Fold 2 ROC-AUC: 0.8060
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]    valid_0's binary_logloss: 0.468949
Fold 3 ROC-AUC: 0.7511
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[70]    valid_0's binary_logloss: 0.409224
Fold 4 ROC-AUC: 0.7927
--------------------------------------------------
FINISHED | Whole ROC-AUC: 0.7847
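
The fold models can be averaged into test-set predictions as well; a minimal sketch (test_pred is a name introduced here, not part of the original run):

[ ]:
# sketch: average the predicted survival probability over the fold models
test_pred = np.mean([m.predict_proba(df_test_feature.values)[:, 1] for m in models],
                    axis=0)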
[ ]:
def visualize_importance(models, df_train_feature):
    """lightGBM の model 配列の feature importance を plot する
    CVごとのブレを boxen plot として表現.

    args:
        models:
            List of lightGBM models
        feat_train_df:
            学習時に使った DataFrame
    """
    feature_importance_df = pd.DataFrame()
    for i, model in enumerate(models):
        _df = pd.DataFrame()
        _df['feature_importance'] = model.feature_importances_
        _df['column'] = df_train_feature.columns
        _df['fold'] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, _df],
                                          axis=0, ignore_index=True)

    order = feature_importance_df.groupby('column')\
        .sum()[['feature_importance']]\
        .sort_values('feature_importance', ascending=False).index[:50]

    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x='feature_importance',
                  y='column',
                  order=order,
                  ax=ax,
                  palette='viridis',
                  orient='h')
    ax.tick_params(axis='x', rotation=90)
    ax.set_title('Importance')
    ax.grid()
    fig.tight_layout()
    return fig, ax
[ ]:
fig, ax = visualize_importance(models, df_train_feature)
../_images/src_01_01_44_0.png

Model : Regression

[ ]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']
[ ]:
from sklearn.model_selection import train_test_split
df_train, df_test, y_train, y_test = train_test_split(df, y, train_size=0.7, random_state=0)
len(df_train), len(df_test), len(y_train), len(y_test)
(14447, 6193, 14447, 6193)
[ ]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

def fit_lgbm(X,
             y,
             cv,
             params: dict=None,
             verbose: int=500):

    if params is None:
        params = {}

    models = []
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                callbacks=[lgbm.early_stopping(stopping_rounds=100),
                           lgbm.log_evaluation(period=verbose)])

        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        print(f'Fold {i} RMSE: {mean_squared_error(y_valid, pred_i) ** .5:.4f}')

    score = mean_squared_error(y, oof_pred) ** .5
    print('-' * 50)
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models
[ ]:
params = {
    'objective': 'rmse',
    'learning_rate': .1,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 10000,
    'colsample_bytree': .5,
    'min_child_samples': 10,
    'subsample_freq': 3,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
}
[ ]:
from sklearn.model_selection import KFold

fold = KFold(n_splits=5, shuffle=True, random_state=71)
cv = list(fold.split(df_train, y_train))  # split() returns a generator, so convert it to a list explicitly

oof, models = fit_lgbm(df_train.values, y_train, cv, params=params, verbose=500)
Training until validation scores don't improve for 100 rounds
[500]   valid_0's rmse: 0.463818
[1000]  valid_0's rmse: 0.46033
Early stopping, best iteration is:
[984]   valid_0's rmse: 0.459897
Fold 0 RMSE: 0.4599
Training until validation scores don't improve for 100 rounds
[500]   valid_0's rmse: 0.438333
Early stopping, best iteration is:
[584]   valid_0's rmse: 0.437437
Fold 1 RMSE: 0.4374
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[373]   valid_0's rmse: 0.46471
Fold 2 RMSE: 0.4647
Training until validation scores don't improve for 100 rounds
[500]   valid_0's rmse: 0.500634
Early stopping, best iteration is:
[498]   valid_0's rmse: 0.500504
Fold 3 RMSE: 0.5005
Training until validation scores don't improve for 100 rounds
[500]   valid_0's rmse: 0.449492
Early stopping, best iteration is:
[552]   valid_0's rmse: 0.449004
Fold 4 RMSE: 0.4490
--------------------------------------------------
FINISHED | Whole RMSE: 0.4628
[ ]:
fig, ax = visualize_importance(models, df_train)
../_images/src_01_01_51_0.png
[ ]:
# inference: average the predictions of the fold models on the test set
pred = np.array([model.predict(df_test.values) for model in models])
pred = np.mean(pred, axis=0)
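
Because y_test was held out by the split above, the averaged prediction can be scored directly; a quick check:

[ ]:
# holdout RMSE of the fold-averaged prediction
print(f'Test RMSE: {mean_squared_error(y_test, pred) ** .5:.4f}')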
[ ]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(pred, label='Test Predict', ax=ax, color='black')
sns.histplot(oof, label='Out Of Fold', ax=ax, color='C1')
ax.legend();
../_images/src_01_01_53_0.png