sklearn preprocessing
来源:互联网 发布:机顶盒打开软件下载 编辑:程序博客网 时间:2024/05/21 10:43
sklearn preprocessing代码
代码来自Anaconda软件里sklearn模块
init.py
"""The :mod:`sklearn.preprocessing` module includes scaling, centering,normalization, binarization and imputation methods."""from .data import Binarizerfrom .data import KernelCentererfrom .data import MinMaxScalerfrom .data import Normalizerfrom .data import StandardScalerfrom .data import add_dummy_featurefrom .data import binarizefrom .data import normalizefrom .data import scalefrom .data import OneHotEncoderfrom .data import PolynomialFeaturesfrom .label import label_binarizefrom .label import LabelBinarizerfrom .label import LabelEncoderfrom .label import MultiLabelBinarizerfrom .imputation import Imputer__all__ = [ 'Binarizer', 'Imputer', 'KernelCenterer', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer', 'MinMaxScaler', 'Normalizer', 'OneHotEncoder', 'StandardScaler', 'add_dummy_feature', 'PolynomialFeatures', 'binarize', 'normalize', 'scale', 'label_binarize',]
_weights.py
import numpy as npfrom ..utils.fixes import bincountdef _balance_weights(y): """Compute sample weights such that the class distribution of y becomes balanced. Parameters ---------- y : array-like Labels for the samples. Returns ------- weights : array-like The sample weights. """ y = np.asarray(y) y = np.searchsorted(np.unique(y), y) bins = bincount(y) weights = 1. / bins.take(y) weights *= bins.min() return weights
data.py
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr># Mathieu Blondel <mathieu@mblondel.org># Olivier Grisel <olivier.grisel@ensta.org># Andreas Mueller <amueller@ais.uni-bonn.de># Eric Martin <eric@ericmart.in># License: BSD 3 clausefrom itertools import chain, combinationsimport numbersimport warningsimport numpy as npfrom scipy import sparsefrom ..base import BaseEstimator, TransformerMixinfrom ..externals import sixfrom ..utils import check_arrayfrom ..utils import warn_if_not_floatfrom ..utils.extmath import row_normsfrom ..utils.fixes import (combinations_with_replacement as combinations_w_r, bincount)from ..utils.fixes import isclosefrom ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2)from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)from ..utils.validation import check_is_fittedzip = six.moves.zipmap = six.moves.maprange = six.moves.range__all__ = [ 'Binarizer', 'KernelCenterer', 'MinMaxScaler', 'Normalizer', 'OneHotEncoder', 'StandardScaler', 'add_dummy_feature', 'binarize', 'normalize', 'scale',]def _mean_and_std(X, axis=0, with_mean=True, with_std=True): """Compute mean and std deviation for centering, scaling. Zero valued std components are reset to 1.0 to avoid NaNs when scaling. """ X = np.asarray(X) Xr = np.rollaxis(X, axis) if with_mean: mean_ = Xr.mean(axis=0) else: mean_ = None if with_std: std_ = Xr.std(axis=0) if isinstance(std_, np.ndarray): std_[std_ == 0.] = 1.0 elif std_ == 0.: std_ = 1. else: std_ = None return mean_, std_def scale(X, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis Center to the mean and component wise scale to unit variance. Parameters ---------- X : array-like or CSR matrix. The data to center and scale. axis : int (0 by default) axis used to compute the means and standard deviations along. If 0, independently standardize each feature, otherwise (if 1) standardize each sample. with_mean : boolean, True by default If True, center the data before scaling. with_std : boolean, True by default If True, scale the data to unit variance (or equivalently, unit standard deviation). copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). Notes ----- This implementation will refuse to center scipy.sparse matrices since it would make them non-sparse and would potentially crash the program with memory exhaustion problems. Instead the caller is expected to either set explicitly `with_mean=False` (in that case, only variance scaling will be performed on the features of the CSR matrix) or to call `X.toarray()` if he/she expects the materialized dense array to fit in memory. To avoid memory copy the caller should pass a CSR matrix. See also -------- :class:`sklearn.preprocessing.StandardScaler` to perform centering and scaling using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False) warn_if_not_float(X, estimator='The scale function') if sparse.issparse(X): if with_mean: raise ValueError( "Cannot center sparse matrices: pass `with_mean=False` instead" " See docstring for motivation and alternatives.") if axis != 0: raise ValueError("Can only scale sparse matrix on axis=0, " " got axis=%d" % axis) if not sparse.isspmatrix_csr(X): X = X.tocsr() copy = False if copy: X = X.copy() _, var = mean_variance_axis(X, axis=0) var[var == 0.0] = 1.0 inplace_column_scale(X, 1 / np.sqrt(var)) else: X = np.asarray(X) mean_, std_ = _mean_and_std( X, axis, with_mean=with_mean, with_std=with_std) if copy: X = X.copy() # Xr is a view on the original array that enables easy use of # broadcasting on the axis in which we are interested in Xr = np.rollaxis(X, axis) if with_mean: Xr -= mean_ mean_1 = Xr.mean(axis=0) # Verify that mean_1 is 'close to zero'. If X contains very # large values, mean_1 can also be very large, due to a lack of # precision of mean_. In this case, a pre-scaling of the # concerned feature is efficient, for instance by its mean or # maximum. if not np.allclose(mean_1, 0): warnings.warn("Numerical issues were encountered " "when centering the data " "and might not be solved. Dataset may " "contain too large values. You may need " "to prescale your features.") Xr -= mean_1 if with_std: Xr /= std_ if with_mean: mean_2 = Xr.mean(axis=0) # If mean_2 is not 'close to zero', it comes from the fact that # std_ is very small so that mean_2 = mean_1/std_ > 0, even if # mean_1 was close to zero. The problem is thus essentially due # to the lack of precision of mean_. A solution is then to # substract the mean again: if not np.allclose(mean_2, 0): warnings.warn("Numerical issues were encountered " "when scaling the data " "and might not be solved. The standard " "deviation of the data is probably " "very close to 0. ") Xr -= mean_2 return Xclass MinMaxScaler(BaseEstimator, TransformerMixin): """Standardizes features by scaling each feature to a given range. This estimator scales and translates each feature individually such that it is in the given range on the training set, i.e. between zero and one. The standardization is given by:: X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) X_scaled = X_std * (max - min) + min where min, max = feature_range. This standardization is often used as an alternative to zero mean, unit variance scaling. Parameters ---------- feature_range: tuple (min, max), default=(0, 1) Desired range of transformed data. copy : boolean, optional, default True Set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array). Attributes ---------- min_ : ndarray, shape (n_features,) Per feature adjustment for minimum. scale_ : ndarray, shape (n_features,) Per feature relative scaling of the data. """ def __init__(self, feature_range=(0, 1), copy=True): self.feature_range = feature_range self.copy = copy def fit(self, X, y=None): """Compute the minimum and maximum to be used for later scaling. Parameters ---------- X : array-like, shape [n_samples, n_features] The data used to compute the per-feature minimum and maximum used for later scaling along the features axis. """ X = check_array(X, copy=self.copy, ensure_2d=False) warn_if_not_float(X, estimator=self) feature_range = self.feature_range if feature_range[0] >= feature_range[1]: raise ValueError("Minimum of desired feature range must be smaller" " than maximum. Got %s." % str(feature_range)) data_min = np.min(X, axis=0) data_range = np.max(X, axis=0) - data_min # Do not scale constant features if isinstance(data_range, np.ndarray): data_range[data_range == 0.0] = 1.0 elif data_range == 0.: data_range = 1. self.scale_ = (feature_range[1] - feature_range[0]) / data_range self.min_ = feature_range[0] - data_min * self.scale_ self.data_range = data_range self.data_min = data_min return self def transform(self, X): """Scaling features of X according to feature_range. Parameters ---------- X : array-like with shape [n_samples, n_features] Input data that will be transformed. """ check_is_fitted(self, 'scale_') X = check_array(X, copy=self.copy, ensure_2d=False) X *= self.scale_ X += self.min_ return X def inverse_transform(self, X): """Undo the scaling of X according to feature_range. Parameters ---------- X : array-like with shape [n_samples, n_features] Input data that will be transformed. """ check_is_fitted(self, 'scale_') X = check_array(X, copy=self.copy, ensure_2d=False) X -= self.min_ X /= self.scale_ return Xclass StandardScaler(BaseEstimator, TransformerMixin): """Standardize features by removing the mean and scaling to unit variance Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and standard deviation are then stored to be used on later data using the `transform` method. Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). For instance many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the L1 and L2 regularizers of linear models) assume that all features are centered around 0 and have variance in the same order. If a feature has a variance that is orders of magnitude larger that others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected. Parameters ---------- with_mean : boolean, True by default If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory. with_std : boolean, True by default If True, scale the data to unit variance (or equivalently, unit standard deviation). copy : boolean, optional, default True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned. Attributes ---------- mean_ : array of floats with shape [n_features] The mean value for each feature in the training set. std_ : array of floats with shape [n_features] The standard deviation for each feature in the training set. See also -------- :func:`sklearn.preprocessing.scale` to perform centering and scaling without using the ``Transformer`` object oriented API :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True` to further remove the linear correlation across features. """ def __init__(self, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std self.copy = copy def fit(self, X, y=None): """Compute the mean and std to be used for later scaling. Parameters ---------- X : array-like or CSR matrix with shape [n_samples, n_features] The data used to compute the mean and standard deviation used for later scaling along the features axis. """ X = check_array(X, accept_sparse='csr', copy=self.copy, ensure_2d=False) if warn_if_not_float(X, estimator=self): X = X.astype(np.float) if sparse.issparse(X): if self.with_mean: raise ValueError( "Cannot center sparse matrices: pass `with_mean=False` " "instead. See docstring for motivation and alternatives.") self.mean_ = None if self.with_std: var = mean_variance_axis(X, axis=0)[1] self.std_ = np.sqrt(var) self.std_[var == 0.0] = 1.0 else: self.std_ = None return self else: self.mean_, self.std_ = _mean_and_std( X, axis=0, with_mean=self.with_mean, with_std=self.with_std) return self def transform(self, X, y=None, copy=None): """Perform standardization by centering and scaling Parameters ---------- X : array-like with shape [n_samples, n_features] The data used to scale along the features axis. """ check_is_fitted(self, 'std_') copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False) if warn_if_not_float(X, estimator=self): X = X.astype(np.float) if sparse.issparse(X): if self.with_mean: raise ValueError( "Cannot center sparse matrices: pass `with_mean=False` " "instead. See docstring for motivation and alternatives.") if self.std_ is not None: inplace_column_scale(X, 1 / self.std_) else: if self.with_mean: X -= self.mean_ if self.with_std: X /= self.std_ return X def inverse_transform(self, X, copy=None): """Scale back the data to the original representation Parameters ---------- X : array-like with shape [n_samples, n_features] The data used to scale along the features axis. """ check_is_fitted(self, 'std_') copy = copy if copy is not None else self.copy if sparse.issparse(X): if self.with_mean: raise ValueError( "Cannot uncenter sparse matrices: pass `with_mean=False` " "instead See docstring for motivation and alternatives.") if not sparse.isspmatrix_csr(X): X = X.tocsr() copy = False if copy: X = X.copy() if self.std_ is not None: inplace_column_scale(X, self.std_) else: X = np.asarray(X) if copy: X = X.copy() if self.with_std: X *= self.std_ if self.with_mean: X += self.mean_ return Xclass PolynomialFeatures(BaseEstimator, TransformerMixin): """Generate polynomial and interaction features. Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. Parameters ---------- degree : integer The degree of the polynomial features. Default = 2. interaction_only : boolean, default = False If true, only interaction features are produced: features that are products of at most ``degree`` *distinct* input features (so not ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). include_bias : boolean If True (default), then include a bias column, the feature in which all polynomial powers are zero (i.e. a column of ones - acts as an intercept term in a linear model). Examples -------- >>> X = np.arange(6).reshape(3, 2) >>> X array([[0, 1], [2, 3], [4, 5]]) >>> poly = PolynomialFeatures(2) >>> poly.fit_transform(X) array([[ 1, 0, 1, 0, 0, 1], [ 1, 2, 3, 4, 6, 9], [ 1, 4, 5, 16, 20, 25]]) >>> poly = PolynomialFeatures(interaction_only=True) >>> poly.fit_transform(X) array([[ 1, 0, 1, 0], [ 1, 2, 3, 6], [ 1, 4, 5, 20]]) Attributes ---------- powers_ : array, shape (n_input_features, n_output_features) powers_[i, j] is the exponent of the jth input in the ith output. n_input_features_ : int The total number of input features. n_output_features_ : int The total number of polynomial output features. The number of output features is computed by iterating over all suitably sized combinations of input features. Notes ----- Be aware that the number of features in the output array scales polynomially in the number of features of the input array, and exponentially in the degree. High degrees can cause overfitting. See :ref:`examples/linear_model/plot_polynomial_interpolation.py <example_linear_model_plot_polynomial_interpolation.py>` """ def __init__(self, degree=2, interaction_only=False, include_bias=True): self.degree = degree self.interaction_only = interaction_only self.include_bias = include_bias @staticmethod def _combinations(n_features, degree, interaction_only, include_bias): comb = (combinations if interaction_only else combinations_w_r) start = int(not include_bias) return chain.from_iterable(comb(range(n_features), i) for i in range(start, degree + 1)) @property def powers_(self): check_is_fitted(self, 'n_input_features_') combinations = self._combinations(self.n_input_features_, self.degree, self.interaction_only, self.include_bias) return np.vstack(np.bincount(c, minlength=self.n_input_features_) for c in combinations) def fit(self, X, y=None): """ Compute number of output features. """ n_samples, n_features = check_array(X).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) self.n_input_features_ = n_features self.n_output_features_ = sum(1 for _ in combinations) return self def transform(self, X, y=None): """Transform data to polynomial features Parameters ---------- X : array with shape [n_samples, n_features] The data to transform, row by row. Returns ------- XP : np.ndarray shape [n_samples, NP] The matrix of features, where NP is the number of polynomial features generated from the combination of inputs. """ check_is_fitted(self, ['n_input_features_', 'n_output_features_']) X = check_array(X) n_samples, n_features = X.shape if n_features != self.n_input_features_: raise ValueError("X shape does not match training shape") # allocate output data XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype) combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) for i, c in enumerate(combinations): XP[:, i] = X[:, c].prod(1) return XPdef normalize(X, norm='l2', axis=1, copy=True): """Scale input vectors individually to unit norm (vector length). Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to normalize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. norm : 'l1' or 'l2', optional ('l2' by default) The norm to use to normalize each non zero sample (or each non-zero feature if axis is 0). axis : 0 or 1, optional (1 by default) axis used to normalize the data along. If 1, independently normalize each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). See also -------- :class:`sklearn.preprocessing.Normalizer` to perform normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ if norm not in ('l1', 'l2'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: sparse_format = 'csc' elif axis == 1: sparse_format = 'csr' else: raise ValueError("'%d' is not a supported axis" % axis) X = check_array(X, sparse_format, copy=copy) warn_if_not_float(X, 'The normalize function') if axis == 0: X = X.T if sparse.issparse(X): if norm == 'l1': inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) else: if norm == 'l1': norms = np.abs(X).sum(axis=1) norms[norms == 0.0] = 1.0 elif norm == 'l2': norms = row_norms(X) norms[norms == 0.0] = 1.0 X /= norms[:, np.newaxis] if axis == 0: X = X.T return Xclass Normalizer(BaseEstimator, TransformerMixin): """Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1 or l2) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of a copy / conversion). Scaling inputs to unit norms is a common operation for text classification or clustering for instance. For instance the dot product of two l2-normalized TF-IDF vectors is the cosine similarity of the vectors and is the base similarity metric for the Vector Space Model commonly used by the Information Retrieval community. Parameters ---------- norm : 'l1' or 'l2', optional ('l2' by default) The norm to use to normalize each non zero sample. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix). Notes ----- This estimator is stateless (besides constructor parameters), the fit method does nothing but is useful when used in a pipeline. See also -------- :func:`sklearn.preprocessing.normalize` equivalent function without the object oriented API """ def __init__(self, norm='l2', copy=True): self.norm = norm self.copy = copy def fit(self, X, y=None): """Do nothing and return the estimator unchanged This method is just there to implement the usual API and hence work in pipelines. """ X = check_array(X, accept_sparse='csr') return self def transform(self, X, y=None, copy=None): """Scale each non zero row of X to unit norm Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to normalize, row by row. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. """ copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr') return normalize(X, norm=self.norm, axis=1, copy=copy)def binarize(X, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to binarize, element by element. scipy.sparse matrices should be in CSR or CSC format to avoid an un-necessary copy. threshold : float, optional (0.0 by default) Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices. copy : boolean, optional, default True set to False to perform inplace binarization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR / CSC matrix and if axis is 1). See also -------- :class:`sklearn.preprocessing.Binarizer` to perform binarization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ X = check_array(X, accept_sparse=['csr', 'csc'], copy=copy) if sparse.issparse(X): if threshold < 0: raise ValueError('Cannot binarize a sparse matrix with threshold ' '< 0') cond = X.data > threshold not_cond = np.logical_not(cond) X.data[cond] = 1 X.data[not_cond] = 0 X.eliminate_zeros() else: cond = X > threshold not_cond = np.logical_not(cond) X[cond] = 1 X[not_cond] = 0 return Xclass Binarizer(BaseEstimator, TransformerMixin): """Binarize data (set feature values to 0 or 1) according to a threshold Values greater than the threshold map to 1, while values less than or equal to the threshold map to 0. With the default threshold of 0, only positive values map to 1. Binarization is a common operation on text count data where the analyst can decide to only consider the presence or absence of a feature rather than a quantified number of occurrences for instance. It can also be used as a pre-processing step for estimators that consider boolean random variables (e.g. modelled using the Bernoulli distribution in a Bayesian setting). Parameters ---------- threshold : float, optional (0.0 by default) Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices. copy : boolean, optional, default True set to False to perform inplace binarization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix). Notes ----- If the input is a sparse matrix, only the non-zero values are subject to update by the Binarizer class. This estimator is stateless (besides constructor parameters), the fit method does nothing but is useful when used in a pipeline. """ def __init__(self, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy def fit(self, X, y=None): """Do nothing and return the estimator unchanged This method is just there to implement the usual API and hence work in pipelines. """ check_array(X, accept_sparse='csr') return self def transform(self, X, y=None, copy=None): """Binarize each element of X Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to binarize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. """ copy = copy if copy is not None else self.copy return binarize(X, threshold=self.threshold, copy=copy)class KernelCenterer(BaseEstimator, TransformerMixin): """Center a kernel matrix Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a function mapping x to a Hilbert space. KernelCenterer centers (i.e., normalize to have zero mean) the data without explicitly computing phi(x). It is equivalent to centering phi(x) with sklearn.preprocessing.StandardScaler(with_std=False). """ def fit(self, K, y=None): """Fit KernelCenterer Parameters ---------- K : numpy array of shape [n_samples, n_samples] Kernel matrix. Returns ------- self : returns an instance of self. """ K = check_array(K) n_samples = K.shape[0] self.K_fit_rows_ = np.sum(K, axis=0) / n_samples self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples return self def transform(self, K, y=None, copy=True): """Center kernel matrix. Parameters ---------- K : numpy array of shape [n_samples1, n_samples2] Kernel matrix. copy : boolean, optional, default True Set to False to perform inplace computation. Returns ------- K_new : numpy array of shape [n_samples1, n_samples2] """ check_is_fitted(self, 'K_fit_all_') K = check_array(K) if copy: K = K.copy() K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis] K -= self.K_fit_rows_ K -= K_pred_cols K += self.K_fit_all_ return Kdef add_dummy_feature(X, value=1.0): """Augment dataset with an additional dummy feature. This is useful for fitting an intercept term with implementations which cannot otherwise fit it directly. Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] Data. value : float Value to use for the dummy feature. Returns ------- X : array or scipy.sparse matrix with shape [n_samples, n_features + 1] Same data with dummy feature added as first column. Examples -------- >>> from sklearn.preprocessing import add_dummy_feature >>> add_dummy_feature([[0, 1], [1, 0]]) array([[ 1., 0., 1.], [ 1., 1., 0.]]) """ X = check_array(X, accept_sparse=['csc', 'csr', 'coo']) n_samples, n_features = X.shape shape = (n_samples, n_features + 1) if sparse.issparse(X): if sparse.isspmatrix_coo(X): # Shift columns to the right. col = X.col + 1 # Column indices of dummy feature are 0 everywhere. col = np.concatenate((np.zeros(n_samples), col)) # Row indices of dummy feature are 0, ..., n_samples-1. row = np.concatenate((np.arange(n_samples), X.row)) # Prepend the dummy feature n_samples times. data = np.concatenate((np.ones(n_samples) * value, X.data)) return sparse.coo_matrix((data, (row, col)), shape) elif sparse.isspmatrix_csc(X): # Shift index pointers since we need to add n_samples elements. indptr = X.indptr + n_samples # indptr[0] must be 0. indptr = np.concatenate((np.array([0]), indptr)) # Row indices of dummy feature are 0, ..., n_samples-1. indices = np.concatenate((np.arange(n_samples), X.indices)) # Prepend the dummy feature n_samples times. data = np.concatenate((np.ones(n_samples) * value, X.data)) return sparse.csc_matrix((data, indices, indptr), shape) else: klass = X.__class__ return klass(add_dummy_feature(X.tocoo(), value)) else: return np.hstack((np.ones((n_samples, 1)) * value, X))def _transform_selected(X, transform, selected="all", copy=True): """Apply a transform function to portion of selected features Parameters ---------- X : array-like or sparse matrix, shape=(n_samples, n_features) Dense array or sparse matrix. transform : callable A callable transform(X) -> X_transformed copy : boolean, optional Copy X even if it could be avoided. selected: "all" or array of indices or mask Specify which features to apply the transform to. Returns ------- X : array or sparse matrix, shape=(n_samples, n_features_new) """ if selected == "all": return transform(X) X = check_array(X, accept_sparse='csc', copy=copy) if len(selected) == 0: return X n_features = X.shape[1] ind = np.arange(n_features) sel = np.zeros(n_features, dtype=bool) sel[np.asarray(selected)] = True not_sel = np.logical_not(sel) n_selected = np.sum(sel) if n_selected == 0: # No features selected. return X elif n_selected == n_features: # All features selected. return transform(X) else: X_sel = transform(X[:, ind[sel]]) X_not_sel = X[:, ind[not_sel]] if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): return sparse.hstack((X_sel, X_not_sel)) else: return np.hstack((X_sel, X_not_sel))class OneHotEncoder(BaseEstimator, TransformerMixin): """Encode categorical integer features using a one-hot aka one-of-K scheme. The input to this transformer should be a matrix of integers, denoting the values taken on by categorical (discrete) features. The output will be a sparse matrix where each column corresponds to one possible value of one feature. It is assumed that input features take on values in the range [0, n_values). This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. Parameters ---------- n_values : 'auto', int or array of ints Number of values per feature. - 'auto' : determine value range from training data. - int : maximum value for all features. - array : maximum value per feature. categorical_features: "all" or array of indices or mask Specify what features are treated as categorical. - 'all' (default): All features are treated as categorical. - array of indices: Array of categorical feature indices. - mask: Array of length n_features and with dtype=bool. Non-categorical features are always stacked to the right of the matrix. dtype : number type, default=np.float Desired dtype of output. sparse : boolean, default=True Will return sparse matrix if set True else will return an array. handle_unknown : str, 'error' or 'ignore' Whether to raise an error or ignore if a unknown categorical feature is present during transform. Attributes ---------- active_features_ : array Indices for active features, meaning values that actually occur in the training set. Only available when n_values is ``'auto'``. feature_indices_ : array of shape (n_features,) Indices to feature ranges. Feature ``i`` in the original data is mapped to features from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` (and then potentially masked by `active_features_` afterwards) n_values_ : array of shape (n_features,) Maximum number of values per feature. Examples -------- Given a dataset with three features and two samples, we let the encoder find the maximum value per feature and transform the data to a binary one-hot encoding. >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \[1, 0, 2]]) # doctest: +ELLIPSIS OneHotEncoder(categorical_features='all', dtype=<... 'float'>, handle_unknown='error', n_values='auto', sparse=True) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 2, 5, 9]) >>> enc.transform([[0, 1, 1]]).toarray() array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) See also -------- sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of dictionary items (also handles string-valued features). sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot encoding of dictionary items or strings. """ def __init__(self, n_values="auto", categorical_features="all", dtype=np.float, sparse=True, handle_unknown='error'): self.n_values = n_values self.categorical_features = categorical_features self.dtype = dtype self.sparse = sparse self.handle_unknown = handle_unknown def fit(self, X, y=None): """Fit OneHotEncoder to X. Parameters ---------- X : array-like, shape=(n_samples, n_feature) Input array of type int. Returns ------- self """ self.fit_transform(X) return self def _fit_transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape if self.n_values == 'auto': n_values = np.max(X, axis=0) + 1 elif isinstance(self.n_values, numbers.Integral): if (np.max(X, axis=0) >= self.n_values).any(): raise ValueError("Feature out of bounds for n_values=%d" % self.n_values) n_values = np.empty(n_features, dtype=np.int) n_values.fill(self.n_values) else: try: n_values = np.asarray(self.n_values, dtype=int) except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. Expected" " 'auto', int or array of ints, got %r" % type(X)) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") self.n_values_ = n_values n_values = np.hstack([[0], n_values]) indices = np.cumsum(n_values) self.feature_indices_ = indices column_indices = (X + indices[:-1]).ravel() row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features) data = np.ones(n_samples * n_features) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() if self.n_values == 'auto': mask = np.array(out.sum(axis=0)).ravel() != 0 active_features = np.where(mask)[0] out = out[:, active_features] self.active_features_ = active_features return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. Equivalent to self.fit(X).transform(X), but more convenient and more efficient. See fit for the parameters, transform for the return value. """ return _transform_selected(X, self._fit_transform, self.categorical_features, copy=True) def _transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape indices = self.feature_indices_ if n_features != indices.shape[0] - 1: raise ValueError("X has different shape than during fitting." " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) # We use only those catgorical features of X that are known using fit. # i.e lesser than n_values_ using mask. # This means, if self.handle_unknown is "ignore", the row_indices and # col_indices corresponding to the unknown categorical feature are # ignored. mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: raise ValueError("handle_unknown should be either error or " "unknown got %s" % self.handle_unknown) if self.handle_unknown == 'error': raise ValueError("unknown categorical feature present %s " "during transform." % X[~mask]) column_indices = (X + indices[:-1]).ravel()[mask] row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)[mask] data = np.ones(np.sum(mask)) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() if self.n_values == 'auto': out = out[:, self.active_features_] return out if self.sparse else out.toarray() def transform(self, X): """Transform X using one-hot encoding. Parameters ---------- X : array-like, shape=(n_samples, n_features) Input array of type int. Returns ------- X_out : sparse matrix if sparse=True else a 2-d array, dtype=int Transformed input. """ return _transform_selected(X, self._transform, self.categorical_features, copy=True)
imputation.py
# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com># License: BSD 3 clauseimport warningsimport numpy as npimport numpy.ma as mafrom scipy import sparsefrom scipy import statsfrom ..base import BaseEstimator, TransformerMixinfrom ..utils import array2dfrom ..utils import atleast2d_or_csrfrom ..utils import atleast2d_or_cscfrom ..utils import as_float_arrayfrom ..utils.fixes import astypefrom ..externals import sixzip = six.moves.zipmap = six.moves.map__all__ = [ 'Imputer',]def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask == "NaN" or np.isnan(value_to_mask): return np.isnan(X) else: return X == value_to_maskdef _get_median(data, n_zeros): """Compute the median of data with n_zeros additional zeros. This function is used to support sparse matrices; it modifies data in-place """ n_elems = len(data) + n_zeros if not n_elems: return np.nan n_negative = np.count_nonzero(data < 0) middle, is_odd = divmod(n_elems, 2) data.sort() if is_odd: return _get_elem_at_rank(middle, data, n_negative, n_zeros) return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) + _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.def _get_elem_at_rank(rank, data, n_negative, n_zeros): """Find the value in data augmented with n_zeros for the given rank""" if rank < n_negative: return data[rank] if rank - n_negative < n_zeros: return 0 return data[rank - n_zeros]def _most_frequent(array, extra_value, n_repeat): """Compute the most frequent value in a 1d array extended with [extra_value] * n_repeat, where extra_value is assumed to be not part of the array.""" # Compute the most frequent value in array only if array.size > 0: mode = stats.mode(array) most_frequent_value = mode[0][0] most_frequent_count = mode[1][0] else: most_frequent_value = 0 most_frequent_count = 0 # Compare to array + [extra_value] * n_repeat if most_frequent_count == 0 and n_repeat == 0: return np.nan elif most_frequent_count < n_repeat: return extra_value elif most_frequent_count > n_repeat: return most_frequent_value elif most_frequent_count == n_repeat: # Ties the breaks. Copy the behaviour of scipy.stats.mode if most_frequent_value < extra_value: return most_frequent_value else: return extra_valueclass Imputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. Parameters ---------- missing_values : integer or "NaN", optional (default="NaN") The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". strategy : string, optional (default="mean") The imputation strategy. - If "mean", then replace missing values using the mean along the axis. - If "median", then replace missing values using the median along the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. axis : integer, optional (default=0) The axis along which to impute. - If `axis=0`, then impute along columns. - If `axis=1`, then impute along rows. verbose : integer, optional (default=0) Controls the verbosity of the imputer. copy : boolean, optional (default=True) If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; - If X is sparse and `missing_values=0`; - If `axis=0` and X is encoded as a CSR matrix; - If `axis=1` and X is encoded as a CSC matrix. Attributes ---------- `statistics_` : array of shape (n_features,) The imputation fill value for each feature if axis == 0. Notes ----- - When ``axis=0``, columns which only contained missing values at `fit` are discarded upon `transform`. - When ``axis=1``, an exception is raised if there are rows for which it is not possible to fill in the missing values (e.g., because they only contain missing values). """ def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy self.axis = axis self.verbose = verbose self.copy = copy def fit(self, X, y=None): """Fit the imputer on X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- self : object Returns self. """ # Check parameters allowed_strategies = ["mean", "median", "most_frequent"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, self.strategy)) if self.axis not in [0, 1]: raise ValueError("Can only impute missing values on axis 0 and 1, " " got axis={0}".format(self.axis)) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). if self.axis == 0: X = atleast2d_or_csc(X, dtype=np.float64, force_all_finite=False) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, self.axis) else: self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, self.axis) return self def _sparse_fit(self, X, strategy, missing_values, axis): """Fit the transformer on sparse data.""" # Imputation is done "by column", so if we want to do it # by row we only need to convert the matrix to csr format. if axis == 1: X = X.tocsr() else: X = X.tocsc() # Count the zeros if missing_values == 0: n_zeros_axis = np.zeros(X.shape[not axis], dtype=int) else: n_zeros_axis = X.shape[axis] - np.diff(X.indptr) # Mean if strategy == "mean": if missing_values != 0: n_non_missing = n_zeros_axis # Mask the missing elements mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.logical_not(mask_missing_values) # Sum only the valid elements new_data = X.data.copy() new_data[mask_missing_values] = 0 X = sparse.csc_matrix((new_data, X.indices, X.indptr), copy=False) sums = X.sum(axis=0) # Count the elements != 0 mask_non_zeros = sparse.csc_matrix( (mask_valids.astype(np.float64), X.indices, X.indptr), copy=False) s = mask_non_zeros.sum(axis=0) n_non_missing = np.add(n_non_missing, s) else: sums = X.sum(axis=axis) n_non_missing = np.diff(X.indptr) # Ignore the error, columns with a np.nan statistics_ # are not an error at this point. These columns will # be removed in transform with np.errstate(all="ignore"): return np.ravel(sums) / np.ravel(n_non_missing) # Median + Most frequent else: # Remove the missing values, for each column columns_all = np.hsplit(X.data, X.indptr[1:-1]) mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.hsplit(np.logical_not(mask_missing_values), X.indptr[1:-1]) # astype necessary for bug in numpy.hsplit before v1.9 columns = [col[astype(mask, bool, copy=False)] for col, mask in zip(columns_all, mask_valids)] # Median if strategy == "median": median = np.empty(len(columns)) for i, column in enumerate(columns): median[i] = _get_median(column, n_zeros_axis[i]) return median # Most frequent elif strategy == "most_frequent": most_frequent = np.empty(len(columns)) for i, column in enumerate(columns): most_frequent[i] = _most_frequent(column, 0, n_zeros_axis[i]) return most_frequent def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" X = array2d(X, force_all_finite=False) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) # Mean if strategy == "mean": mean_masked = np.ma.mean(masked_X, axis=axis) # Avoid the warning "Warning: converting a masked element to nan." mean = np.ma.getdata(mean_masked) mean[np.ma.getmask(mean_masked)] = np.nan return mean # Median elif strategy == "median": if tuple(int(v) for v in np.__version__.split('.')[:2]) < (1, 5): # In old versions of numpy, calling a median on an array # containing nans returns nan. This is different is # recent versions of numpy, which we want to mimic masked_X.mask = np.logical_or(masked_X.mask, np.isnan(X)) median_masked = np.ma.median(masked_X, axis=axis) # Avoid the warning "Warning: converting a masked element to nan." median = np.ma.getdata(median_masked) median[np.ma.getmaskarray(median_masked)] = np.nan return median # Most frequent elif strategy == "most_frequent": # scipy.stats.mstats.mode cannot be used because it will no work # properly if the first element is masked and if it's frequency # is equal to the frequency of the most frequent valid element # See https://github.com/scipy/scipy/issues/2636 # To be able access the elements by columns if axis == 0: X = X.transpose() mask = mask.transpose() most_frequent = np.empty(X.shape[0]) for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): row_mask = np.logical_not(row_mask).astype(np.bool) row = row[row_mask] most_frequent[i] = _most_frequent(row, np.nan, 0) return most_frequent def transform(self, X): """Impute all missing values in X. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. """ # Copy just once X = as_float_array(X, copy=self.copy, force_all_finite=False) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data need to be recomputed # when the imputation is done per sample if self.axis == 1: X = atleast2d_or_csr(X, force_all_finite=False, copy=False) if sparse.issparse(X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, self.axis) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, self.axis) else: X = atleast2d_or_csc(X, force_all_finite=False, copy=False) statistics = self.statistics_ # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] elif self.axis == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: mask = _get_mask(X.data, self.missing_values) indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), np.diff(X.indptr))[mask] X.data[mask] = valid_statistics[indexes].astype(X.dtype) else: if sparse.issparse(X): X = X.toarray() mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) values = np.repeat(valid_statistics, n_missing) if self.axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask X[coordinates] = values return X
label.py
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr># Mathieu Blondel <mathieu@mblondel.org># Olivier Grisel <olivier.grisel@ensta.org># Andreas Mueller <amueller@ais.uni-bonn.de># Joel Nothman <joel.nothman@gmail.com># Hamzeh Alsalhi <ha258@cornell.edu># License: BSD 3 clausefrom collections import defaultdictimport itertoolsimport arrayimport warningsimport numpy as npimport scipy.sparse as spfrom ..base import BaseEstimator, TransformerMixinfrom ..utils.fixes import np_versionfrom ..utils.fixes import sparse_min_maxfrom ..utils.fixes import astypefrom ..utils.fixes import in1dfrom ..utils import deprecated, column_or_1dfrom ..utils.validation import check_arrayfrom ..utils.validation import _num_samplesfrom ..utils.multiclass import unique_labelsfrom ..utils.multiclass import type_of_targetfrom ..externals import sixzip = six.moves.zipmap = six.moves.map__all__ = [ 'label_binarize', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer',]def _check_numpy_unicode_bug(labels): """Check that user is not subject to an old numpy bug Fixed in master before 1.7.0: https://github.com/numpy/numpy/pull/243 """ if np_version[:3] < (1, 7, 0) and labels.dtype.kind == 'U': raise RuntimeError("NumPy < 1.7.0 does not implement searchsorted" " on unicode data correctly. Please upgrade" " NumPy to use LabelEncoder with unicode inputs.")class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. Attributes ---------- classes_ : array of shape (n_class,) Holds the label for each class. Examples -------- `LabelEncoder` can be used to normalize labels. >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS array([0, 0, 1, 2]...) >>> le.inverse_transform([0, 0, 1, 2]) array([1, 1, 2, 6]) It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS array([2, 2, 1]...) >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] """ def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") def fit(self, y): """Fit label encoder Parameters ---------- y : array-like of shape (n_samples,) Target values. Returns ------- self : returns an instance of self. """ y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) return self def fit_transform(self, y): """Fit label encoder and return encoded labels Parameters ---------- y : array-like of shape [n_samples] Target values. Returns ------- y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) return y def transform(self, y): """Transform labels to normalized encoding. Parameters ---------- y : array-like of shape [n_samples] Target values. Returns ------- y : array-like of shape [n_samples] """ self._check_fitted() classes = np.unique(y) _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) raise ValueError("y contains new labels: %s" % str(diff)) return np.searchsorted(self.classes_, y) def inverse_transform(self, y): """Transform labels back to original encoding. Parameters ---------- y : numpy array of shape [n_samples] Target values. Returns ------- y : numpy array of shape [n_samples] """ self._check_fitted() y = np.asarray(y) return self.classes_[y]class LabelBinarizer(BaseEstimator, TransformerMixin): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are available in the scikit. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. At learning time, this simply consists in learning one regressor or binary classifier per class. In doing so, one needs to convert multi-class labels to binary labels (belong or does not belong to the class). LabelBinarizer makes this process easy with the transform method. At prediction time, one assigns the class for which the corresponding model gave the greatest confidence. LabelBinarizer makes this easy with the inverse_transform method. Parameters ---------- neg_label : int (default: 0) Value with which negative labels must be encoded. pos_label : int (default: 1) Value with which positive labels must be encoded. sparse_output : boolean (default: False) True if the returned array from transform is desired to be in sparse CSR format. Attributes ---------- classes_ : array of shape [n_class] Holds the label for each class. y_type_ : str, Represents the type of the target data as evaluated by utils.multiclass.type_of_target. Possible type are 'continuous', 'continuous-multioutput', 'binary', 'multiclass', 'mutliclass-multioutput', 'multilabel-sequences', 'multilabel-indicator', and 'unknown'. multilabel_ : boolean True if the transformer was fitted on a multilabel rather than a multiclass set of labels. The ``multilabel_`` attribute is deprecated and will be removed in 0.18 sparse_input_ : boolean, True if the input data to transform is given as a sparse matrix, False otherwise. indicator_matrix_ : str 'sparse' when the input data to tansform is a multilable-indicator and is sparse, None otherwise. The ``indicator_matrix_`` attribute is deprecated as of version 0.16 and will be removed in 0.18 Examples -------- >>> from sklearn import preprocessing >>> lb = preprocessing.LabelBinarizer() >>> lb.fit([1, 2, 6, 4, 2]) LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False) >>> lb.classes_ array([1, 2, 4, 6]) >>> lb.transform([1, 6]) array([[1, 0, 0, 0], [0, 0, 0, 1]]) Binary targets transform to a column vector >>> lb = preprocessing.LabelBinarizer() >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) array([[1], [0], [0], [1]]) Passing a 2D matrix for multilabel classification >>> import numpy as np >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False) >>> lb.classes_ array([0, 1, 2]) >>> lb.transform([0, 1, 2, 1]) array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]) See also -------- label_binarize : function to perform the transform operation of LabelBinarizer with fixed classes. """ def __init__(self, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) if sparse_output and (pos_label == 0 or neg_label != 0): raise ValueError("Sparse binarization is only supported with non " "zero pos_label and zero neg_label, got " "pos_label={0} and neg_label={1}" "".format(pos_label, neg_label)) self.neg_label = neg_label self.pos_label = pos_label self.sparse_output = sparse_output @property @deprecated("Attribute ``indicator_matrix_`` is deprecated and will be " "removed in 0.17. Use ``y_type_ == 'multilabel-indicator'`` " "instead") def indicator_matrix_(self): return self.y_type_ == 'multilabel-indicator' @property @deprecated("Attribute ``multilabel_`` is deprecated and will be removed " "in 0.17. Use ``y_type_.startswith('multilabel')`` " "instead") def multilabel_(self): return self.y_type_.startswith('multilabel') def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelBinarizer was not fitted yet.") def fit(self, y): """Fit label binarizer Parameters ---------- y : numpy array of shape (n_samples,) or (n_samples, n_classes) Target values. The 2-d matrix should only contain 0 and 1, represents multilabel classification. Returns ------- self : returns an instance of self. """ self.y_type_ = type_of_target(y) if 'multioutput' in self.y_type_: raise ValueError("Multioutput target data is not supported with " "label binarization") if _num_samples(y) == 0: raise ValueError('y has 0 samples: %r' % y) self.sparse_input_ = sp.issparse(y) self.classes_ = unique_labels(y) return self def transform(self, y): """Transform multi-class labels to binary labels The output of transform is sometimes referred to by some authors as the 1-of-K coding scheme. Parameters ---------- y : numpy array or sparse matrix of shape (n_samples,) or (n_samples, n_classes) Target values. The 2-d matrix should only contain 0 and 1, represents multilabel classification. Sparse matrix can be CSR, CSC, COO, DOK, or LIL. Returns ------- Y : numpy array or CSR matrix of shape [n_samples, n_classes] Shape will be [n_samples, 1] for binary problems. """ self._check_fitted() y_is_multilabel = type_of_target(y).startswith('multilabel') if y_is_multilabel and not self.y_type_.startswith('multilabel'): raise ValueError("The object was not fitted with multilabel" " input.") return label_binarize(y, self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output) def inverse_transform(self, Y, threshold=None): """Transform binary labels back to multi-class labels Parameters ---------- Y : numpy array or sparse matrix with shape [n_samples, n_classes] Target values. All sparse matrices are converted to CSR before inverse transformation. threshold : float or None Threshold used in the binary and multi-label cases. Use 0 when: - Y contains the output of decision_function (classifier) Use 0.5 when: - Y contains the output of predict_proba If None, the threshold is assumed to be half way between neg_label and pos_label. Returns ------- y : numpy array or CSR matrix of shape [n_samples] Target values. Notes ----- In the case when the binary labels are fractional (probabilistic), inverse_transform chooses the class with the greatest value. Typically, this allows to use the output of a linear model's decision_function method directly as the input of inverse_transform. """ self._check_fitted() if threshold is None: threshold = (self.pos_label + self.neg_label) / 2. if self.y_type_ == "multiclass": y_inv = _inverse_binarize_multiclass(Y, self.classes_) else: y_inv = _inverse_binarize_thresholding(Y, self.y_type_, self.classes_, threshold) if self.sparse_input_: y_inv = sp.csr_matrix(y_inv) elif sp.issparse(y_inv): y_inv = y_inv.toarray() return y_invdef label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False, multilabel=None): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are available in the scikit. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. This function makes it possible to compute this transformation for a fixed set of class labels known ahead of time. Parameters ---------- y : array-like Sequence of integer labels or multilabel data to encode. classes : array-like of shape [n_classes] Uniquely holds the label for each class. neg_label : int (default: 0) Value with which negative labels must be encoded. pos_label : int (default: 1) Value with which positive labels must be encoded. sparse_output : boolean (default: False), Set to true if output binary array is desired in CSR sparse format Returns ------- Y : numpy array or CSR matrix of shape [n_samples, n_classes] Shape will be [n_samples, 1] for binary problems. Examples -------- >>> from sklearn.preprocessing import label_binarize >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) array([[1, 0, 0, 0], [0, 0, 0, 1]]) The class ordering is preserved: >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) array([[1, 0, 0, 0], [0, 1, 0, 0]]) Binary targets transform to a column vector >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) array([[1], [0], [0], [1]]) See also -------- LabelBinarizer : class used to wrap the functionality of label_binarize and allow for fitting to classes independently of the transform operation """ if not isinstance(y, list): # XXX Workaround that will be removed when list of list format is # dropped y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None) else: if _num_samples(y) == 0: raise ValueError('y has 0 samples: %r' % y) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) if (sparse_output and (pos_label == 0 or neg_label != 0)): raise ValueError("Sparse binarization is only supported with non " "zero pos_label and zero neg_label, got " "pos_label={0} and neg_label={1}" "".format(pos_label, neg_label)) if multilabel is not None: warnings.warn("The multilabel parameter is deprecated as of version " "0.15 and will be removed in 0.17. The parameter is no " "longer necessary because the value is automatically " "inferred.", DeprecationWarning) # To account for pos_label == 0 in the dense case pos_switch = pos_label == 0 if pos_switch: pos_label = -neg_label y_type = type_of_target(y) if 'multioutput' in y_type: raise ValueError("Multioutput target data is not supported with label " "binarization") n_samples = y.shape[0] if sp.issparse(y) else len(y) n_classes = len(classes) classes = np.asarray(classes) if y_type == "binary": if len(classes) == 1: Y = np.zeros((len(y), 1), dtype=np.int) Y += neg_label return Y elif len(classes) >= 3: y_type = "multiclass" sorted_class = np.sort(classes) if (y_type == "multilabel-indicator" and classes.size != y.shape[1]): raise ValueError("classes {0} missmatch with the labels {1}" "found in the data".format(classes, unique_labels(y))) if y_type in ("binary", "multiclass"): y = column_or_1d(y) # pick out the known labels from y y_in_classes = in1d(y, classes) y_seen = y[y_in_classes] indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) data = np.empty_like(indices) data.fill(pos_label) Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": Y = sp.csr_matrix(y) if pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) Y.data = data elif y_type == "multilabel-sequences": Y = MultiLabelBinarizer(classes=classes, sparse_output=sparse_output).fit_transform(y) if sp.issparse(Y): Y.data[:] = pos_label else: Y[Y == 1] = pos_label return Y if not sparse_output: Y = Y.toarray() Y = astype(Y, int, copy=False) if neg_label != 0: Y[Y == 0] = neg_label if pos_switch: Y[Y == pos_label] = 0 else: Y.data = astype(Y.data, int, copy=False) # preserve label ordering if np.any(classes != sorted_class): indices = np.searchsorted(sorted_class, classes) Y = Y[:, indices] if y_type == "binary": if sparse_output: Y = Y.getcol(-1) else: Y = Y[:, -1].reshape((-1, 1)) return Ydef _inverse_binarize_multiclass(y, classes): """Inverse label binarization transformation for multiclass. Multiclass uses the maximal score instead of a threshold. """ classes = np.asarray(classes) if sp.issparse(y): # Find the argmax for each row in y where y is a CSR matrix y = y.tocsr() n_samples, n_outputs = y.shape outputs = np.arange(n_outputs) row_max = sparse_min_max(y, 1)[1] row_nnz = np.diff(y.indptr) y_data_repeated_max = np.repeat(row_max, row_nnz) # picks out all indices obtaining the maximum per row y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) # For corner case where last row has a max of 0 if row_max[-1] == 0: y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) # Gets the index of the first argmax in each row from y_i_all_argmax index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) # first argmax of each row y_ind_ext = np.append(y.indices, [0]) y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] # Handle rows of all 0 y_i_argmax[np.where(row_nnz == 0)[0]] = 0 # Handles rows with max of 0 that contain negative numbers samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] for i in samples: ind = y.indices[y.indptr[i]:y.indptr[i + 1]] y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] return classes[y_i_argmax] else: return classes.take(y.argmax(axis=1), mode="clip")def _inverse_binarize_thresholding(y, output_type, classes, threshold): """Inverse label binarization transformation using thresholding.""" if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: raise ValueError("output_type='binary', but y.shape = {0}". format(y.shape)) if output_type != "binary" and y.shape[1] != len(classes): raise ValueError("The number of class is not equal to the number of " "dimension of y.") classes = np.asarray(classes) # Perform thresholding if sp.issparse(y): if threshold > 0: if y.format not in ('csr', 'csc'): y = y.tocsr() y.data = np.array(y.data > threshold, dtype=np.int) y.eliminate_zeros() else: y = np.array(y.toarray() > threshold, dtype=np.int) else: y = np.array(y > threshold, dtype=np.int) # Inverse transform data if output_type == "binary": if sp.issparse(y): y = y.toarray() if y.ndim == 2 and y.shape[1] == 2: return classes[y[:, 1]] else: if len(classes) == 1: y = np.empty(len(y), dtype=classes.dtype) y.fill(classes[0]) return y else: return classes[y.ravel()] elif output_type == "multilabel-indicator": return y elif output_type == "multilabel-sequences": warnings.warn('Direct support for sequence of sequences multilabel ' 'representation will be unavailable from version 0.17. ' 'Use sklearn.preprocessing.MultiLabelBinarizer to ' 'convert to a label indicator representation.', DeprecationWarning) mlb = MultiLabelBinarizer(classes=classes).fit([]) return mlb.inverse_transform(y) else: raise ValueError("{0} format is not supported".format(output_type))class MultiLabelBinarizer(BaseEstimator, TransformerMixin): """Transform between iterable of iterables and a multilabel format Although a list of sets or tuples is a very intuitive format for multilabel data, it is unwieldy to process. This transformer converts between this intuitive format and the supported multilabel format: a (samples x classes) binary matrix indicating the presence of a class label. Parameters ---------- classes : array-like of shape [n_classes] (optional) Indicates an ordering for the class labels sparse_output : boolean (default: False), Set to true if output binary array is desired in CSR sparse format Attributes ---------- classes_ : array of labels A copy of the `classes` parameter where provided, or otherwise, the sorted set of classes found when fitting. Examples -------- >>> mlb = MultiLabelBinarizer() >>> mlb.fit_transform([(1, 2), (3,)]) array([[1, 1, 0], [0, 0, 1]]) >>> mlb.classes_ array([1, 2, 3]) >>> mlb.fit_transform([set(['sci-fi', 'thriller']), set(['comedy'])]) array([[0, 1, 1], [1, 0, 0]]) >>> list(mlb.classes_) ['comedy', 'sci-fi', 'thriller'] """ def __init__(self, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output def fit(self, y): """Fit the label sets binarizer, storing `classes_` Parameters ---------- y : iterable of iterables A set of labels (any orderable and hashable object) for each sample. If the `classes` parameter is set, `y` will not be iterated. Returns ------- self : returns this MultiLabelBinarizer instance """ if self.classes is None: classes = sorted(set(itertools.chain.from_iterable(y))) else: classes = self.classes dtype = np.int if all(isinstance(c, int) for c in classes) else object self.classes_ = np.empty(len(classes), dtype=dtype) self.classes_[:] = classes return self def fit_transform(self, y): """Fit the label sets binarizer and transform the given label sets Parameters ---------- y : iterable of iterables A set of labels (any orderable and hashable object) for each sample. If the `classes` parameter is set, `y` will not be iterated. Returns ------- y_indicator : array or CSR matrix, shape (n_samples, n_classes) A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in `y[i]`, and 0 otherwise. """ if self.classes is not None: return self.fit(y).transform(y) # Automatically increment on new class class_mapping = defaultdict(int) class_mapping.default_factory = class_mapping.__len__ yt = self._transform(y, class_mapping) # sort classes and reorder columns tmp = sorted(class_mapping, key=class_mapping.get) # (make safe for tuples) dtype = np.int if all(isinstance(c, int) for c in tmp) else object class_mapping = np.empty(len(tmp), dtype=dtype) class_mapping[:] = tmp self.classes_, inverse = np.unique(class_mapping, return_inverse=True) yt.indices = np.take(inverse, yt.indices) if not self.sparse_output: yt = yt.toarray() return yt def transform(self, y): """Transform the given label sets Parameters ---------- y : iterable of iterables A set of labels (any orderable and hashable object) for each sample. If the `classes` parameter is set, `y` will not be iterated. Returns ------- y_indicator : array or CSR matrix, shape (n_samples, n_classes) A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in `y[i]`, and 0 otherwise. """ class_to_index = dict(zip(self.classes_, range(len(self.classes_)))) yt = self._transform(y, class_to_index) if not self.sparse_output: yt = yt.toarray() return yt def _transform(self, y, class_mapping): """Transforms the label sets with a given mapping Parameters ---------- y : iterable of iterables class_mapping : Mapping Maps from label to column index in label indicator matrix Returns ------- y_indicator : sparse CSR matrix, shape (n_samples, n_classes) Label indicator matrix """ indices = array.array('i') indptr = array.array('i', [0]) for labels in y: indices.extend(set(class_mapping[label] for label in labels)) indptr.append(len(indices)) data = np.ones(len(indices), dtype=int) return sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))) def inverse_transform(self, yt): """Transform the given indicator matrix into label sets Parameters ---------- yt : array or sparse matrix of shape (n_samples, n_classes) A matrix containing only 1s ands 0s. Returns ------- y : list of tuples The set of labels for each sample such that `y[i]` consists of `classes_[j]` for each `yt[i, j] == 1`. """ if yt.shape[1] != len(self.classes_): raise ValueError('Expected indicator for {0} classes, but got {1}' .format(len(self.classes_), yt.shape[1])) if sp.issparse(yt): yt = yt.tocsr() if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0: raise ValueError('Expected only 0s and 1s in label indicator.') return [tuple(self.classes_.take(yt.indices[start:end])) for start, end in zip(yt.indptr[:-1], yt.indptr[1:])] else: unexpected = np.setdiff1d(yt, [0, 1]) if len(unexpected) > 0: raise ValueError('Expected only 0s and 1s in label indicator. ' 'Also got {0}'.format(unexpected)) return [tuple(self.classes_.compress(indicators)) for indicators in yt]
0 0
- sklearn preprocessing
- sklearn-Preprocessing
- Sklearn-preprocessing.PolynomialFeatures
- sklearn.preprocessing.PolynomialFeatures 用法
- sklearn.preprocessing.PolynomialFeatures
- sklearn.preprocessing.Binarizer
- sklearn.preprocessing.OneHotEncoder
- sklearn.preprocessing.Imputer
- sklearn.preprocessing.LabelEncoder
- sklearn.preprocessing.Normalizer
- sklearn.preprocessing.LabelBinarizer
- sklearn.preprocessing.MultiLabelBinarizer
- PYTHON-sklearn.preprocessing
- sklearn.preprocessing.LabelEncoder
- Preprocessing data-sklearn数据预处理
- Sklearn-preprocessing.scale/StandardScaler/MinMaxScaler
- sklearn中数据预处理(preprocessing)
- sklearn.preprocessing的部分用法
- 并查集
- libsvm Guide
- Leetcode99: Combinations
- jsonp应用
- 利用SpannableStringBuilder设置TextView中部分文字的颜色
- sklearn preprocessing
- 一篇很全面的freemarker教程
- Xcode7真机调试iOS应用程序
- 关于应用第一次安装打开,点击home键,然后再点击图标打开时,重复打开页面问题
- Java学习资源
- mysql 学习---->数值计算、逻辑判断、范围选择、位运算
- java_web转发和重定向的区别
- 输入流和输出流
- dao层的实现类中的配置BaseDao,所有的DaoImpl可以继承此类