11.13.缺失值处理
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 10:13:47 2020
@author: two
"""
import numpy as np
from toolkit import H
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer
SimpleImputer
# --- Mean imputation demo ---------------------------------------------------
# Toy matrix with a single missing entry (row 1, column 1).
X = np.array([
    [7, 2, 3],
    [4, np.nan, 6],
    [10, 5, 9],
])

# Replace every NaN with the mean of its column, learned during fit().
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)

# `H` is the project-local toolkit helper; judging by the recorded output
# that follows this snippet, get_param shows the constructor parameters of
# the class it is given.
# NOTE(review): this inspects MissingIndicator although the section is about
# SimpleImputer -- confirm that is intended.
H.get_param(MissingIndicator)

# fit() returns the estimator itself, so the transform can be chained.
print(imp_mean.fit(X).transform(X))
(missing_values=nan, features='missing-only', sparse='auto',
error_on_new=True)
[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   5.   9. ]]
MissingIndicator
# --- Missing-value mask demo ------------------------------------------------
# Matrix the indicator is fitted on.
X1 = np.array([
    [np.nan, 1, 3],
    [4, 0, np.nan],
    [8, 1, 0],
])

# Matrix whose missing entries are flagged afterwards.
X2 = np.array([
    [5, 1, np.nan],
    [np.nan, 2, 3],
    [2, 4, 0],
])

# features="all" requests one mask column per input column instead of only
# the columns that contained missing values at fit time.
# fit() returns the estimator, so it is chained onto the constructor.
indicator = MissingIndicator(features="all").fit(X1)

# Boolean mask: True wherever X2 holds a missing value.
X2_tr = indicator.transform(X2)

# Bare expression: shows the mask in an interactive session
# (it has no visible effect when the file is run as a script).
X2_tr
array([[False, False,  True],
       [ True, False, False],
       [False, False, False]])
KNNImputer
# --- k-nearest-neighbours imputation demo -----------------------------------
# Four samples, three features; two entries are missing.
X = np.array([
    [1, 2, np.nan],
    [3, 4, 3],
    [np.nan, 6, 5],
    [8, 8, 7],
])

# Project-local helper; the recorded output after this snippet shows it
# listing KNNImputer's constructor parameters and their defaults.
H.get_param(KNNImputer)

# Each missing entry is estimated from the 2 nearest samples
# (the default neighbour count is 5, see the parameter listing).
imputer = KNNImputer(n_neighbors=2)

# Bare expression: fits, imputes and displays the completed matrix in an
# interactive session; the result is not stored.
imputer.fit_transform(X)
(missing_values=nan, n_neighbors=5, weights='uniform',
metric='nan_euclidean', copy=True, add_indicator=False)
array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])