import pandas as pd
df = pd.read_csv("Iris.csv")
df.head(2)
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
df.Species.unique()
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)
만약 train data에는 없던 범주(category)가 test data에만 존재하는데 one-hot encoding이 필요하다면 어떻게 처리해야 할까?¶
데이터를 준비해보겠습니다.
# Training split: every row whose species is not 'Iris-setosa', relabelled from 0.
train = df.loc[df["Species"] != "Iris-setosa"].reset_index(drop=True)
train["Species"].unique()
array(['Iris-versicolor', 'Iris-virginica'], dtype=object)
# Test split: only the 'Iris-setosa' rows, relabelled from 0.
test = df.loc[df["Species"] == "Iris-setosa"].reset_index(drop=True)
test["Species"].unique()
array(['Iris-setosa'], dtype=object)
pd.get_dummies()는 현재 주어진 데이터에 존재하는 범주만으로 one-hot encoding을 수행하므로, train과 test에 따로 적용하면 생성되는 열이 서로 달라져 그대로는 사용할 수 없다.
1. train과 test 데이터를 합쳐 one-hot encoding을 진행한 뒤 다시 분리하는 방법¶
len(train)
100
len(test)
50
# Stack train on top of test so that both parts are encoded together.
# The original row labels are kept, so label values repeat across the parts.
catd = pd.concat([train, test], axis="index")
catd
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor |
... | ... | ... | ... | ... | ... |
45 | 4.8 | 3.0 | 1.4 | 0.3 | Iris-setosa |
46 | 5.1 | 3.8 | 1.6 | 0.2 | Iris-setosa |
47 | 4.6 | 3.2 | 1.4 | 0.2 | Iris-setosa |
48 | 5.3 | 3.7 | 1.5 | 0.2 | Iris-setosa |
49 | 5.0 | 3.3 | 1.4 | 0.2 | Iris-setosa |
150 rows × 5 columns
catd[["Species"]]
Species | |
---|---|
0 | Iris-versicolor |
1 | Iris-versicolor |
2 | Iris-versicolor |
3 | Iris-versicolor |
4 | Iris-versicolor |
... | ... |
45 | Iris-setosa |
46 | Iris-setosa |
47 | Iris-setosa |
48 | Iris-setosa |
49 | Iris-setosa |
150 rows × 1 columns
# One-hot encode the species column of the combined frame.
dummy = pd.get_dummies(catd.loc[:, ["Species"]])
dummy
Species_Iris-setosa | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 0 | 1 | 0 |
3 | 0 | 1 | 0 |
4 | 0 | 1 | 0 |
... | ... | ... | ... |
45 | 1 | 0 | 0 |
46 | 1 | 0 | 0 |
47 | 1 | 0 | 0 |
48 | 1 | 0 | 0 |
49 | 1 | 0 | 0 |
150 rows × 3 columns
# Attach the indicator columns to the right-hand side of the combined frame.
cat2 = pd.concat([catd, dummy], axis="columns")
cat2
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | Species_Iris-setosa | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor | 0 | 1 | 0 |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor | 0 | 1 | 0 |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
45 | 4.8 | 3.0 | 1.4 | 0.3 | Iris-setosa | 1 | 0 | 0 |
46 | 5.1 | 3.8 | 1.6 | 0.2 | Iris-setosa | 1 | 0 | 0 |
47 | 4.6 | 3.2 | 1.4 | 0.2 | Iris-setosa | 1 | 0 | 0 |
48 | 5.3 | 3.7 | 1.5 | 0.2 | Iris-setosa | 1 | 0 | 0 |
49 | 5.0 | 3.3 | 1.4 | 0.2 | Iris-setosa | 1 | 0 | 0 |
150 rows × 8 columns
# Split the encoded frame back into its parts by position: `catd` was built
# as train followed by test, so the first len(train) rows are the training
# rows.  Filtering on the species label would only work for this particular
# split, where train and test share no category.
train2 = cat2.iloc[:len(train)].copy()
train2.head()
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | Species_Iris-setosa | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor | 0 | 1 | 0 |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor | 0 | 1 | 0 |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
# The rows after the first len(train) are the test rows, because `catd` was
# built as train followed by test.  Splitting by position does not depend on
# which species ended up in which part.
test2 = cat2.iloc[len(train):].copy()
test2.head()
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | Species_Iris-setosa | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa | 1 | 0 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa | 1 | 0 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa | 1 | 0 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa | 1 | 0 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa | 1 | 0 | 0 |
2. agg를 이용함¶
# Collect the distinct species labels as a plain Python list.
listdf = df["Species"].unique().tolist()
listdf
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# Put the labels in descending order; the order is changed arbitrarily here.
listdf = sorted(listdf, reverse=True)
listdf
['Iris-virginica', 'Iris-versicolor', 'Iris-setosa']
df2=df.copy()
df2
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
# Each function returns a single value per call, so agg is used to bundle
# several such functions; the result has one column per function.
def fun0(value: str) -> int:
    """Return 1 when ``value`` is 'Iris-virginica', otherwise 0."""
    return 1 if value == "Iris-virginica" else 0
def fun1(value: str) -> int:
    """Return 1 when ``value`` is 'Iris-versicolor', otherwise 0."""
    return 1 if value == "Iris-versicolor" else 0
def fun2(value: str) -> int:
    """Return 1 when ``value`` is 'Iris-setosa', otherwise 0."""
    return 1 if value == "Iris-setosa" else 0
train
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor |
... | ... | ... | ... | ... | ... |
95 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
96 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
97 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
98 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
99 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
100 rows × 5 columns
# Run the three indicator functions over the training species column;
# the result has one column per function.
aggdata = train["Species"].aggregate([fun0, fun1, fun2])
aggdata.head()
fun0 | fun1 | fun2 | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 0 | 1 | 0 |
3 | 0 | 1 | 0 |
4 | 0 | 1 | 0 |
# Place the indicator columns next to the original training columns.
aggtrain = pd.concat([train, aggdata], axis="columns")
aggtrain
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | fun0 | fun1 | fun2 | |
---|---|---|---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor | 0 | 1 | 0 |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor | 0 | 1 | 0 |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica | 1 | 0 | 0 |
96 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica | 1 | 0 | 0 |
97 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica | 1 | 0 | 0 |
98 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica | 1 | 0 | 0 |
99 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica | 1 | 0 | 0 |
100 rows × 8 columns
# Apply the same three indicator functions to the test species column.
# This rebinds `aggdata`, replacing the training result.
aggdata = test["Species"].aggregate([fun0, fun1, fun2])
aggdata.head()
fun0 | fun1 | fun2 | |
---|---|---|---|
0 | 0 | 0 | 1 |
1 | 0 | 0 | 1 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
# Place the indicator columns next to the original test columns.
aggtest = pd.concat([test, aggdata], axis="columns")
aggtest.head()
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | fun0 | fun1 | fun2 | |
---|---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa | 0 | 0 | 1 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa | 0 | 0 | 1 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa | 0 | 0 | 1 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa | 0 | 0 | 1 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa | 0 | 0 | 1 |
3. sklearn 의 one-hot encoding을 이용함¶
from sklearn.preprocessing import OneHotEncoder
# Fit the encoder on the training species only, then encode the training rows.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and later removed — confirm against the installed version.
encoder=OneHotEncoder(sparse=False).fit(X=train[["Species"]])
encoder.transform(train[["Species"]])
array([[1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.]])
encoder.transform(test[["Species"]]) # raises ValueError: the test category was not in the data the encoder was fitted on
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In [178], line 1 ----> 1 encoder.transform(test[["Species"]]) # test set에 존재하지 않으면 오류가 발생한다 File ~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\preprocessing\_encoders.py:882, in OneHotEncoder.transform(self, X) 877 # validation of X happens in _check_X called by _transform 878 warn_on_unknown = self.drop is not None and self.handle_unknown in { 879 "ignore", 880 "infrequent_if_exist", 881 } --> 882 X_int, X_mask = self._transform( 883 X, 884 handle_unknown=self.handle_unknown, 885 force_all_finite="allow-nan", 886 warn_on_unknown=warn_on_unknown, 887 ) 888 self._map_infrequent_categories(X_int, X_mask) 890 n_samples, n_features = X_int.shape File ~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\preprocessing\_encoders.py:160, in _BaseEncoder._transform(self, X, handle_unknown, force_all_finite, warn_on_unknown) 155 if handle_unknown == "error": 156 msg = ( 157 "Found unknown categories {0} in column {1}" 158 " during transform".format(diff, i) 159 ) --> 160 raise ValueError(msg) 161 else: 162 if warn_on_unknown: ValueError: Found unknown categories ['Iris-setosa'] in column 0 during transform
# With handle_unknown="ignore", a category that was not seen during fit no
# longer raises; its row is encoded as all zeros.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and later removed — confirm against the installed version.
encoder=OneHotEncoder(sparse=False,handle_unknown="ignore").fit(X=train[["Species"]])
encoder.transform(test[["Species"]])
array([[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]])
# handle_unknown="infrequent_if_exist" 을 넣으면 `max_categories` or `min_frequency` 인자도 넣어줘야한다
# 자세한 사용법은 문서 참고
# 여기에서는 극단적인 예로 처리했지만, test set에는 train 시 사용했던 범주가 있을 수도 있고 없을 수도 있기 때문에 ignore는 유용하다
array를 dataframe으로 변환하는 방법¶
# Output column names produced by the fitted encoder, one per encoded category.
feature_name=encoder.get_feature_names_out()
feature_name
array(['Species_Iris-versicolor', 'Species_Iris-virginica'], dtype=object)
# Encode the training species and add one integer 0/1 column per encoded
# category to `train`; column i of the array belongs to feature_name[i].
train_encdata = encoder.transform(train[["Species"]])
for i, feature_name_one in enumerate(feature_name):
    # The encoder returns floats; store the indicators as integers.
    train[feature_name_one] = train_encdata[:, i].astype(int)
train.head()
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|---|---|---|---|
0 | 7.0 | 3.2 | 4.7 | 1.4 | Iris-versicolor | 1 | 0 |
1 | 6.4 | 3.2 | 4.5 | 1.5 | Iris-versicolor | 1 | 0 |
2 | 6.9 | 3.1 | 4.9 | 1.5 | Iris-versicolor | 1 | 0 |
3 | 5.5 | 2.3 | 4.0 | 1.3 | Iris-versicolor | 1 | 0 |
4 | 6.5 | 2.8 | 4.6 | 1.5 | Iris-versicolor | 1 | 0 |
# Encode the test species with the encoder fitted on train and add one
# integer 0/1 column per encoded category to `test`; column i of the array
# belongs to feature_name[i].
test_encdata = encoder.transform(test[["Species"]])
for i, feature_name_one in enumerate(feature_name):
    # The encoder returns floats; store the indicators as integers.
    test[feature_name_one] = test_encdata[:, i].astype(int)
test.head()
SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | Species_Iris-versicolor | Species_Iris-virginica | |
---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa | 0 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa | 0 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa | 0 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa | 0 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa | 0 | 0 |