import torch
from Korpora import Korpora
import pandas as pd
Load the NSMC dataset that we will fine-tune on.
NSMC = Korpora.load('nsmc')
Korpora only provides functionality to easily download and use corpora that others have shared for research purposes. We thank those who shared the corpora, and the description and license of each corpus are given below. If you want to know more about this corpus, please refer to the description below; when using it for research or commercial purposes, please refer to the license below.

# Description
Author : e9t@github
Repository : https://github.com/e9t/nsmc
References : www.lucypark.kr/docs/2015-pyconkr/#39

Naver sentiment movie corpus v1.0
This is a movie review dataset in the Korean language. Reviews were scraped from Naver Movies.
The dataset construction is based on the method noted in [Large movie review dataset][^1] from Maas et al., 2011.
[^1]: http://ai.stanford.edu/~amaas/data/sentiment/

# License
CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\jun\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\jun\Korpora\nsmc\ratings_test.txt
Put the train/test splits into pandas DataFrames.
train_data = pd.DataFrame({"texts":NSMC.train.texts, "labels":NSMC.train.labels})
test_data = pd.DataFrame({"texts":NSMC.test.texts, "labels":NSMC.test.labels})
train_data
| | texts | labels |
|---|---|---|
| 0 | 아 더빙.. 진짜 짜증나네요 목소리 | 0 |
| 1 | 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 | 1 |
| 2 | 너무재밓었다그래서보는것을추천한다 | 0 |
| 3 | 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 | 0 |
| 4 | 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... | 1 |
| ... | ... | ... |
| 149995 | 인간이 문제지.. 소는 뭔죄인가.. | 0 |
| 149996 | 평점이 너무 낮아서... | 1 |
| 149997 | 이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다? | 0 |
| 149998 | 청춘 영화의 최고봉.방황과 우울했던 날들의 자화상 | 1 |
| 149999 | 한국 영화 최초로 수간하는 내용이 담긴 영화 | 0 |

150000 rows × 2 columns
test_data
| | texts | labels |
|---|---|---|
| 0 | 굳 ㅋ | 1 |
| 1 | GDNTOPCLASSINTHECLUB | 0 |
| 2 | 뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아 | 0 |
| 3 | 지루하지는 않은데 완전 막장임... 돈주고 보기에는.... | 0 |
| 4 | 3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠?? | 0 |
| ... | ... | ... |
| 49995 | 오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함 | 1 |
| 49996 | 의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따 OOOO | 0 |
| 49997 | 그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다 | 0 |
| 49998 | 절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네 | 0 |
| 49999 | 마무리는 또 왜이래 | 0 |

50000 rows × 2 columns
max(len(l) for l in train_data['texts'])
158
max(len(l) for l in test_data['texts'])
152
The full train/test sets are large and training would take too long, so here we cut them down to 1/10 of their size (this code is only a sample).
train_data = train_data.head(int(len(train_data)/10))
test_data = test_data.head(int(len(test_data)/10))
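head() just keeps the first rows of each split. If a random subsample is preferred instead, a minimal sketch (not applied here) would be:

# Hypothetical alternative: take a random 10% sample instead of the first 10%
train_data = train_data.sample(frac=0.1, random_state=3).reset_index(drop=True)
test_data = test_data.sample(frac=0.1, random_state=3).reset_index(drop=True)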
Load the tokenizer of the pre-trained BERT model that will be fine-tuned, and tokenize the sentences.
pretrained_model_name="beomi/kcbert-base"
from transformers import AutoTokenizer
# If a warning appears, install ipywidgets first: !pip install ipywidgets
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name
)
tokenized_train_sentences = tokenizer(
    list(train_data.texts),
    return_tensors="pt",   # return PyTorch tensors
    padding=True,          # pad to the longest sequence in the batch
    truncation=True,       # truncate to the model's maximum length
)
tokenized_test_sentences = tokenizer(
    list(test_data.texts),
    return_tensors="pt",
    padding=True,
    truncation=True,
)
Inspect the tokenizer output.
print(tokenized_train_sentences.keys())
print(tokenized_train_sentences['input_ids'])
print(tokenized_train_sentences['attention_mask'])
print(tokenized_train_sentences['token_type_ids'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
tensor([[ 2, 2170, 832, ..., 0, 0, 0], [ 2, 3521, 17, ..., 0, 0, 0], [ 2, 8069, 4089, ..., 0, 0, 0], ..., [ 2, 43, 17697, ..., 0, 0, 0], [ 2, 2477, 4116, ..., 0, 0, 0], [ 2, 2170, 4565, ..., 0, 0, 0]])
tensor([[1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], ..., [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0]])
tensor([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]])
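To see what these ids correspond to, the first training example can be mapped back to tokens. A small sketch reusing the tokenizer and tensors from above:

# Convert the first example's ids back into subword tokens and readable text
first_ids = tokenized_train_sentences['input_ids'][0].tolist()
print(tokenizer.convert_ids_to_tokens(first_ids)[:10])        # first few subword tokens, starting with [CLS]
print(tokenizer.decode(first_ids, skip_special_tokens=True))  # special and padding tokens removed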
train_label = train_data['labels'].values
test_label = test_data['labels'].values
Prepare a Dataset for the data loader; this is needed so that individual examples can be looked up by index when batches are built during training.
class DataloaderDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = DataloaderDataset(tokenized_train_sentences, train_label)
test_dataset = DataloaderDataset(tokenized_test_sentences, test_label)
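A quick sanity check on the wrapped datasets (a sketch using the objects defined above). The torch.tensor() call inside __getitem__ is what triggers the copy-construct UserWarning that appears in the training log below; returning val[idx].clone().detach() instead would silence it.

# Each item is a dict of tensors plus the integer label
sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})
print(len(train_dataset), len(test_dataset))  # 15000 5000

# Possible tweak inside __getitem__ to avoid the UserWarning (not applied above):
# item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}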
from transformers import BertConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
cuda:0
pretrained_model_config = BertConfig.from_pretrained(
    pretrained_model_name,
)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    config=pretrained_model_config,
)
Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
pretrained_model_config
BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}
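One value worth noting is max_position_embeddings: 300, meaning kcbert-base cannot take sequences longer than 300 tokens; this is why truncation=True was passed to the tokenizer. A small check:

# Padded sequence length vs. the model's position limit
seq_len = tokenized_train_sentences['input_ids'].shape[1]
print(seq_len, pretrained_model_config.max_position_embeddings)  # seq_len should not exceed 300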
#!pip install evaluate
#!pip install scikit-learn
import numpy as np
import evaluate
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
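The Trainer calls compute_metrics with a (logits, labels) tuple at every evaluation. A tiny sketch with made-up values shows the return format:

# Hypothetical logits for two examples: the first predicts class 1, the second class 0
dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])
dummy_labels = np.array([1, 0])
print(compute_metrics((dummy_logits, dummy_labels)))  # {'accuracy': 1.0}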
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    #per_device_train_batch_size=32, # batch size per device during training
    #per_device_eval_batch_size=64,  # batch size for evaluation
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    save_steps=200,
    save_total_limit=2,
    save_on_each_node=True,
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",
    seed=3,
)
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
More arguments are described in the Hugging Face TrainingArguments documentation.
trainer = Trainer(
    model=model,                 # the instantiated 🤗 Transformers model to be trained
    args=training_args,          # training arguments, defined above
    train_dataset=train_dataset, # training dataset
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
***** Running training *****
  Num examples = 15000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 938
C:\Users\jun\AppData\Local\Temp\ipykernel_27736\1263192275.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
[938/938 04:42, Epoch 1/1]
| Epoch | Training Loss | Validation Loss | Accuracy |
|---|---|---|---|
| 1 | 0.344700 | 0.314100 | 0.867800 |
Saving model checkpoint to ./results\checkpoint-200
Configuration saved in ./results\checkpoint-200\config.json
Model weights saved in ./results\checkpoint-200\pytorch_model.bin
C:\Users\jun\AppData\Local\Temp\ipykernel_27736\1263192275.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results\checkpoint-400
Configuration saved in ./results\checkpoint-400\config.json
Model weights saved in ./results\checkpoint-400\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-500] due to args.save_total_limit
C:\Users\jun\AppData\Local\Temp\ipykernel_27736\1263192275.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results\checkpoint-600
Configuration saved in ./results\checkpoint-600\config.json
Model weights saved in ./results\checkpoint-600\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-200] due to args.save_total_limit
C:\Users\jun\AppData\Local\Temp\ipykernel_27736\1263192275.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results\checkpoint-800
Configuration saved in ./results\checkpoint-800\config.json
Model weights saved in ./results\checkpoint-800\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-400] due to args.save_total_limit
C:\Users\jun\AppData\Local\Temp\ipykernel_27736\1263192275.py:7: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 16
Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=938, training_loss=0.3618158074075988, metrics={'train_runtime': 283.118, 'train_samples_per_second': 52.981, 'train_steps_per_second': 3.313, 'total_flos': 824791491900000.0, 'train_loss': 0.3618158074075988, 'epoch': 1.0})
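If the final metrics are needed as a dictionary rather than read off the log table, evaluation can also be run explicitly; a minimal sketch:

# Re-run evaluation on test_dataset and get the metrics as a dict
eval_result = trainer.evaluate()
print(eval_result['eval_loss'], eval_result['eval_accuracy'])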
trainer.save_model("trained_model")
Saving model checkpoint to trained_model
Configuration saved in trained_model\config.json
Model weights saved in trained_model\pytorch_model.bin
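To reuse the fine-tuned model later, load it back from the saved directory. A minimal inference sketch, assuming the tokenizer from above is still in memory (it was not saved into trained_model; tokenizer.save_pretrained("trained_model") would store it alongside the weights), with a hypothetical review sentence:

# Load the fine-tuned weights and classify a new sentence (1 = positive, 0 = negative)
loaded_model = AutoModelForSequenceClassification.from_pretrained("trained_model")
loaded_model.eval()

inputs = tokenizer("이 영화 정말 재미있어요", return_tensors="pt")  # hypothetical review text
with torch.no_grad():
    logits = loaded_model(**inputs).logits
print(logits.argmax(dim=-1).item())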