{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# BERT による特徴量エンジニアリング" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# !pip install -q transformers" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'4.9.2'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import transformers\n", "\n", "from transformers import BertTokenizer\n", "from tqdm import tqdm\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import japanize_matplotlib\n", "from sklearn.cluster import KMeans\n", "\n", "transformers.__version__" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
0翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも...
1RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ...
\n", "
" ], "text/plain": [ " text\n", "0 翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも...\n", "1 RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ..." ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dummy data\n", "data = {\n", " 'text': [\n", " '翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも早くて精度が高い。',\n", " 'RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデルで計算量も精度も改善した。'\n", " ]\n", "}\n", "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# 前処理\n", "# def cleaning(x):\n", "# return x.replace('\\u3000', '').replace('■', '').replace(' ', '').replace('\\n', '').replace(' ', '').replace('【', '').replace('】', '')\n", "# df['message'] = df['massage'].aaply(lambda x: cleaning(x))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# BERT\n", "class BertSequenceVectorizer:\n", " def __init__(self):\n", " self.device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", " self.model_name = 'cl-tohoku/bert-base-japanese'\n", " self.tokenizer = BertTokenizer.from_pretrained(self.model_name)\n", " self.bert_model = transformers.BertModel.from_pretrained(self.model_name)\n", " self.bert_model = self.bert_model.to(self.device)\n", " self.max_len = 128\n", "\n", " def vectorize(self, sentence : str) -> np.array:\n", " inp = self.tokenizer.encode(sentence)\n", " len_inp = len(inp)\n", "\n", " if len_inp >= self.max_len:\n", " inputs = inp[:self.max_len]\n", " masks = [1] * self.max_len\n", " else:\n", " inputs = inp + [0] * (self.max_len - len_inp)\n", " masks = [1] * len_inp + [0] * (self.max_len - len_inp)\n", "\n", " inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)\n", " masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)\n", "\n", " bert_out = self.bert_model(inputs_tensor, masks_tensor)\n", " seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']\n", "\n", " if torch.cuda.is_available():\n", " return seq_out[0][0].cpu().detach().numpy() # 0番目は [CLS] token, 768 dim の文章特徴量\n", " else:\n", " return seq_out[0][0].detach().numpy()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def to_bert_feature(col:str)->pd.DataFrame:\n", " BSV = BertSequenceVectorizer()\n", " feature_df = pd.DataFrame()\n", " feature_df['description'] = df[col]\n", " feature_df['description_feature'] = df[col].apply(lambda x : BSV.vectorize(x))\n", " return feature_df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n", "The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. \n", "The class this function is called from is 'BertTokenizer'.\n", "Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptiondescription_feature
0翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも...[0.07872914, -0.04343322, -0.26853547, -0.7479...
1RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ...[0.037794013, 0.14948744, -0.37833312, -0.1988...
\n", "
" ], "text/plain": [ " description \\\n", "0 翻訳タスクにおいて、Seq2seq(RNNベースEncoder-Decoderモデル)よりも... \n", "1 RNNもCNNも使わずに Attentionのみを使用したEncoder-Decoderモデ... \n", "\n", " description_feature \n", "0 [0.07872914, -0.04343322, -0.26853547, -0.7479... \n", "1 [0.037794013, 0.14948744, -0.37833312, -0.1988... " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 実行\n", "feature_df = to_bert_feature(col='text')\n", "feature_df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# 768 のベクトルのリストを行列に変換\n", "def to_matrix(feature:pd.DataFrame)->pd.DataFrame:\n", " out_df = pd.DataFrame()\n", " for feature in feature['description_feature']:\n", " df_ = pd.DataFrame(feature)\n", " out_df = pd.concat([out_df, df_], axis=1)\n", " out_df = out_df.T\n", " out_df.index = range(len(out_df))\n", " return out_df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...758759760761762763764765766767
00.078729-0.043433-0.268535-0.7479980.2179140.291106-0.1069790.034716-0.213352-0.271663...-0.1207240.2806710.6966980.2242480.154533-0.2333680.141323-0.0769300.0150570.416093
10.0377940.149487-0.378333-0.1988800.3043340.3026270.075751-0.335357-0.009018-0.043352...0.359600-0.1700980.2526570.1505150.506875-0.5115510.392957-0.2869560.2417660.346306
\n", "

2 rows × 768 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 0.078729 -0.043433 -0.268535 -0.747998 0.217914 0.291106 -0.106979 \n", "1 0.037794 0.149487 -0.378333 -0.198880 0.304334 0.302627 0.075751 \n", "\n", " 7 8 9 ... 758 759 760 761 \\\n", "0 0.034716 -0.213352 -0.271663 ... -0.120724 0.280671 0.696698 0.224248 \n", "1 -0.335357 -0.009018 -0.043352 ... 0.359600 -0.170098 0.252657 0.150515 \n", "\n", " 762 763 764 765 766 767 \n", "0 0.154533 -0.233368 0.141323 -0.076930 0.015057 0.416093 \n", "1 0.506875 -0.511551 0.392957 -0.286956 0.241766 0.346306 \n", "\n", "[2 rows x 768 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature = to_matrix(feature_df)\n", "feature" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.13 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "195d00c3bc2576aa3aa8d34b1ef69c319bc4c5e1d04057dba8a69b2c34c3aaa0" } } }, "nbformat": 4, "nbformat_minor": 2 }