Feature Engineering with BERT

[1]:
# !pip install -q transformers fugashi ipadic
[2]:
import numpy as np
import pandas as pd
import torch
import transformers

from transformers import BertJapaneseTokenizer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
from sklearn.cluster import KMeans

transformers.__version__
[2]:
'4.9.2'
[3]:
# dummy data
data = {
    'text': [
        '翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも早くて精度が高い。',
        'RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデルで計算量も精度も改善した。'
    ]
}
df = pd.DataFrame(data)
df
[3]:
text
0 翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも...
1 RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ...
[4]:
# Preprocessing (commented-out example for real data)
# def cleaning(x):
#     return x.replace('\u3000', '').replace('■', '').replace('\n', '').replace(' ', '').replace('【', '').replace('】', '')
# df['message'] = df['message'].apply(lambda x: cleaning(x))
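For reference, the chained replace calls above can be collapsed into a single regular expression. A minimal sketch, where the function name cleaning and the 'message' column are only illustrative, mirroring the commented-out cell:

import re

def cleaning(x: str) -> str:
    # Remove whitespace (including full-width spaces and newlines) and the symbols ■ 【 】 in one pass
    return re.sub(r'[\u3000\s■【】]', '', x)

# df['message'] = df['message'].apply(cleaning)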
[5]:
# BERT
class BertSequenceVectorizer:
    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = 'cl-tohoku/bert-base-japanese'
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.bert_model.eval()  # inference only: disable dropout
        self.max_len = 128

    def vectorize(self, sentence: str) -> np.ndarray:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        # Truncate or zero-pad to max_len and build the matching attention mask
        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        # Index 0 is the [CLS] token: a 768-dim sentence feature
        return seq_out[0][0].detach().cpu().numpy()
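The vectorize() method returns the hidden state of the [CLS] token (index 0) as the sentence feature. Mean pooling over the non-padding tokens is a common alternative; the sketch below is a hypothetical helper (vectorize_mean is not part of the original class) that reuses a BertSequenceVectorizer instance and the same truncation/padding scheme:

def vectorize_mean(bsv: BertSequenceVectorizer, sentence: str) -> np.ndarray:
    # Same tokenization, truncation, and zero-padding as BertSequenceVectorizer.vectorize()
    inp = bsv.tokenizer.encode(sentence)[:bsv.max_len]
    inputs = inp + [0] * (bsv.max_len - len(inp))
    masks = [1] * len(inp) + [0] * (bsv.max_len - len(inp))
    inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(bsv.device)
    masks_tensor = torch.tensor([masks], dtype=torch.long).to(bsv.device)
    with torch.no_grad():
        seq_out = bsv.bert_model(inputs_tensor, masks_tensor)['last_hidden_state']
    mask = masks_tensor.unsqueeze(-1)                         # (1, max_len, 1)
    mean_vec = (seq_out * mask).sum(dim=1) / mask.sum(dim=1)  # average over real tokens only
    return mean_vec[0].cpu().numpy()                          # 768-dim sentence feature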
[6]:
def to_bert_feature(df: pd.DataFrame, col: str) -> pd.DataFrame:
    # Turn each row of df[col] into a 768-dim BERT sentence feature
    BSV = BertSequenceVectorizer()
    feature_df = pd.DataFrame()
    feature_df['description'] = df[col]
    feature_df['description_feature'] = df[col].apply(lambda x: BSV.vectorize(x))
    return feature_df
[7]:
# Run
feature_df = to_bert_feature(df, col='text')
feature_df
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[7]:
description description_feature
0 翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも... [0.07872914, -0.04343322, -0.26853547, -0.7479...
1 RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ... [0.037794013, 0.14948744, -0.37833312, -0.1988...
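As a quick sanity check, every vector should be 768-dimensional (the hidden size of bert-base-japanese):

# Each feature vector should have length 768
feature_df['description_feature'].apply(len).unique()  # expected: array([768])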
[8]:
# Convert the list of 768-dim vectors into an (n_rows, 768) matrix
def to_matrix(feature_df: pd.DataFrame) -> pd.DataFrame:
    out_df = pd.DataFrame()
    for vec in feature_df['description_feature']:
        df_ = pd.DataFrame(vec)
        out_df = pd.concat([out_df, df_], axis=1)
    out_df = out_df.T
    out_df.index = range(len(out_df))
    return out_df
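Equivalently, the column of vectors can be stacked in a single call with np.vstack; a minimal sketch of the same conversion (to_matrix_vstack is just an illustrative name):

def to_matrix_vstack(feature_df: pd.DataFrame) -> pd.DataFrame:
    # Stack the 768-dim vectors into one (n_rows, 768) matrix
    return pd.DataFrame(np.vstack(feature_df['description_feature'].values))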
[9]:
feature = to_matrix(feature_df)
feature
[9]:
0 1 2 3 4 5 6 7 8 9 ... 758 759 760 761 762 763 764 765 766 767
0 0.078729 -0.043433 -0.268535 -0.747998 0.217914 0.291106 -0.106979 0.034716 -0.213352 -0.271663 ... -0.120724 0.280671 0.696698 0.224248 0.154533 -0.233368 0.141323 -0.076930 0.015057 0.416093
1 0.037794 0.149487 -0.378333 -0.198880 0.304334 0.302627 0.075751 -0.335357 -0.009018 -0.043352 ... 0.359600 -0.170098 0.252657 0.150515 0.506875 -0.511551 0.392957 -0.286956 0.241766 0.346306

2 rows × 768 columns
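The resulting 2 × 768 matrix can be passed straight to downstream models. A minimal sketch using the KMeans imported above (with only two dummy sentences the clustering is purely illustrative, and n_clusters=2 is an arbitrary choice):

# Cluster the sentence vectors; with 2 rows this only illustrates the API
kmeans = KMeans(n_clusters=2, random_state=0)
df['cluster'] = kmeans.fit_predict(feature.values)
df[['text', 'cluster']]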