sklearn preprocessing


Source code of the sklearn preprocessing module

The code below is taken from the sklearn package bundled with the Anaconda distribution.

__init__.py

"""The :mod:`sklearn.preprocessing` module includes scaling, centering,normalization, binarization and imputation methods."""from .data import Binarizerfrom .data import KernelCentererfrom .data import MinMaxScalerfrom .data import Normalizerfrom .data import StandardScalerfrom .data import add_dummy_featurefrom .data import binarizefrom .data import normalizefrom .data import scalefrom .data import OneHotEncoderfrom .data import PolynomialFeaturesfrom .label import label_binarizefrom .label import LabelBinarizerfrom .label import LabelEncoderfrom .label import MultiLabelBinarizerfrom .imputation import Imputer__all__ = [    'Binarizer',    'Imputer',    'KernelCenterer',    'LabelBinarizer',    'LabelEncoder',    'MultiLabelBinarizer',    'MinMaxScaler',    'Normalizer',    'OneHotEncoder',    'StandardScaler',    'add_dummy_feature',    'PolynomialFeatures',    'binarize',    'normalize',    'scale',    'label_binarize',]

_weights.py

import numpy as np

from ..utils.fixes import bincount


def _balance_weights(y):
    """Compute sample weights such that the class distribution of y becomes
       balanced.

    Parameters
    ----------
    y : array-like
        Labels for the samples.

    Returns
    -------
    weights : array-like
        The sample weights.
    """
    y = np.asarray(y)
    y = np.searchsorted(np.unique(y), y)
    bins = bincount(y)

    weights = 1. / bins.take(y)
    weights *= bins.min()

    return weights
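To make the effect of _balance_weights concrete, here is a small worked example of my own that re-implements the same logic with plain np.bincount (the helper itself is private, so importing it directly is not recommended). Classes that occur more often receive proportionally smaller weights, so every class contributes the same total weight:

import numpy as np

# Hypothetical labels: class 0 appears three times, class 1 once.
y = np.array([0, 0, 0, 1])

y_idx = np.searchsorted(np.unique(y), y)   # map labels to 0..n_classes-1
bins = np.bincount(y_idx)                  # per-class counts: [3, 1]
weights = 1. / bins.take(y_idx)            # inverse class frequency per sample
weights *= bins.min()                      # rescale so the rarest class gets weight 1

print(weights)  # [0.333... 0.333... 0.333... 1.0]; each class sums to 1.0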

data.py

# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>#          Mathieu Blondel <mathieu@mblondel.org>#          Olivier Grisel <olivier.grisel@ensta.org>#          Andreas Mueller <amueller@ais.uni-bonn.de>#          Eric Martin <eric@ericmart.in># License: BSD 3 clausefrom itertools import chain, combinationsimport numbersimport warningsimport numpy as npfrom scipy import sparsefrom ..base import BaseEstimator, TransformerMixinfrom ..externals import sixfrom ..utils import check_arrayfrom ..utils import warn_if_not_floatfrom ..utils.extmath import row_normsfrom ..utils.fixes import (combinations_with_replacement as combinations_w_r,                           bincount)from ..utils.fixes import isclosefrom ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,                                      inplace_csr_row_normalize_l2)from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)from ..utils.validation import check_is_fittedzip = six.moves.zipmap = six.moves.maprange = six.moves.range__all__ = [    'Binarizer',    'KernelCenterer',    'MinMaxScaler',    'Normalizer',    'OneHotEncoder',    'StandardScaler',    'add_dummy_feature',    'binarize',    'normalize',    'scale',]def _mean_and_std(X, axis=0, with_mean=True, with_std=True):    """Compute mean and std deviation for centering, scaling.    Zero valued std components are reset to 1.0 to avoid NaNs when scaling.    """    X = np.asarray(X)    Xr = np.rollaxis(X, axis)    if with_mean:        mean_ = Xr.mean(axis=0)    else:        mean_ = None    if with_std:        std_ = Xr.std(axis=0)        if isinstance(std_, np.ndarray):            std_[std_ == 0.] = 1.0        elif std_ == 0.:            std_ = 1.    else:        std_ = None    return mean_, std_def scale(X, axis=0, with_mean=True, with_std=True, copy=True):    """Standardize a dataset along any axis    Center to the mean and component wise scale to unit variance.    Parameters    ----------    X : array-like or CSR matrix.        The data to center and scale.    axis : int (0 by default)        axis used to compute the means and standard deviations along. If 0,        independently standardize each feature, otherwise (if 1) standardize        each sample.    with_mean : boolean, True by default        If True, center the data before scaling.    with_std : boolean, True by default        If True, scale the data to unit variance (or equivalently,        unit standard deviation).    copy : boolean, optional, default True        set to False to perform inplace row normalization and avoid a        copy (if the input is already a numpy array or a scipy.sparse        CSR matrix and if axis is 1).    Notes    -----    This implementation will refuse to center scipy.sparse matrices    since it would make them non-sparse and would potentially crash the    program with memory exhaustion problems.    Instead the caller is expected to either set explicitly    `with_mean=False` (in that case, only variance scaling will be    performed on the features of the CSR matrix) or to call `X.toarray()`    if he/she expects the materialized dense array to fit in memory.    To avoid memory copy the caller should pass a CSR matrix.    See also    --------    :class:`sklearn.preprocessing.StandardScaler` to perform centering and    scaling using the ``Transformer`` API (e.g. 
as part of a preprocessing    :class:`sklearn.pipeline.Pipeline`)    """    X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)    warn_if_not_float(X, estimator='The scale function')    if sparse.issparse(X):        if with_mean:            raise ValueError(                "Cannot center sparse matrices: pass `with_mean=False` instead"                " See docstring for motivation and alternatives.")        if axis != 0:            raise ValueError("Can only scale sparse matrix on axis=0, "                             " got axis=%d" % axis)        if not sparse.isspmatrix_csr(X):            X = X.tocsr()            copy = False        if copy:            X = X.copy()        _, var = mean_variance_axis(X, axis=0)        var[var == 0.0] = 1.0        inplace_column_scale(X, 1 / np.sqrt(var))    else:        X = np.asarray(X)        mean_, std_ = _mean_and_std(            X, axis, with_mean=with_mean, with_std=with_std)        if copy:            X = X.copy()        # Xr is a view on the original array that enables easy use of        # broadcasting on the axis in which we are interested in        Xr = np.rollaxis(X, axis)        if with_mean:            Xr -= mean_            mean_1 = Xr.mean(axis=0)            # Verify that mean_1 is 'close to zero'. If X contains very            # large values, mean_1 can also be very large, due to a lack of            # precision of mean_. In this case, a pre-scaling of the            # concerned feature is efficient, for instance by its mean or            # maximum.            if not np.allclose(mean_1, 0):                warnings.warn("Numerical issues were encountered "                              "when centering the data "                              "and might not be solved. Dataset may "                              "contain too large values. You may need "                              "to prescale your features.")                Xr -= mean_1        if with_std:            Xr /= std_            if with_mean:                mean_2 = Xr.mean(axis=0)                # If mean_2 is not 'close to zero', it comes from the fact that                # std_ is very small so that mean_2 = mean_1/std_ > 0, even if                # mean_1 was close to zero. The problem is thus essentially due                # to the lack of precision of mean_. A solution is then to                # substract the mean again:                if not np.allclose(mean_2, 0):                    warnings.warn("Numerical issues were encountered "                                  "when scaling the data "                                  "and might not be solved. The standard "                                  "deviation of the data is probably "                                  "very close to 0. ")                    Xr -= mean_2    return Xclass MinMaxScaler(BaseEstimator, TransformerMixin):    """Standardizes features by scaling each feature to a given range.    This estimator scales and translates each feature individually such    that it is in the given range on the training set, i.e. between    zero and one.    The standardization is given by::        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))        X_scaled = X_std * (max - min) + min    where min, max = feature_range.    This standardization is often used as an alternative to zero mean,    unit variance scaling.    Parameters    ----------    feature_range: tuple (min, max), default=(0, 1)        Desired range of transformed data.    
copy : boolean, optional, default True        Set to False to perform inplace row normalization and avoid a        copy (if the input is already a numpy array).    Attributes    ----------    min_ : ndarray, shape (n_features,)        Per feature adjustment for minimum.    scale_ : ndarray, shape (n_features,)        Per feature relative scaling of the data.    """    def __init__(self, feature_range=(0, 1), copy=True):        self.feature_range = feature_range        self.copy = copy    def fit(self, X, y=None):        """Compute the minimum and maximum to be used for later scaling.        Parameters        ----------        X : array-like, shape [n_samples, n_features]            The data used to compute the per-feature minimum and maximum            used for later scaling along the features axis.        """        X = check_array(X, copy=self.copy, ensure_2d=False)        warn_if_not_float(X, estimator=self)        feature_range = self.feature_range        if feature_range[0] >= feature_range[1]:            raise ValueError("Minimum of desired feature range must be smaller"                             " than maximum. Got %s." % str(feature_range))        data_min = np.min(X, axis=0)        data_range = np.max(X, axis=0) - data_min        # Do not scale constant features        if isinstance(data_range, np.ndarray):            data_range[data_range == 0.0] = 1.0        elif data_range == 0.:            data_range = 1.        self.scale_ = (feature_range[1] - feature_range[0]) / data_range        self.min_ = feature_range[0] - data_min * self.scale_        self.data_range = data_range        self.data_min = data_min        return self    def transform(self, X):        """Scaling features of X according to feature_range.        Parameters        ----------        X : array-like with shape [n_samples, n_features]            Input data that will be transformed.        """        check_is_fitted(self, 'scale_')        X = check_array(X, copy=self.copy, ensure_2d=False)        X *= self.scale_        X += self.min_        return X    def inverse_transform(self, X):        """Undo the scaling of X according to feature_range.        Parameters        ----------        X : array-like with shape [n_samples, n_features]            Input data that will be transformed.        """        check_is_fitted(self, 'scale_')        X = check_array(X, copy=self.copy, ensure_2d=False)        X -= self.min_        X /= self.scale_        return Xclass StandardScaler(BaseEstimator, TransformerMixin):    """Standardize features by removing the mean and scaling to unit variance    Centering and scaling happen independently on each feature by computing    the relevant statistics on the samples in the training set. Mean and    standard deviation are then stored to be used on later data using the    `transform` method.    Standardization of a dataset is a common requirement for many    machine learning estimators: they might behave badly if the    individual feature do not more or less look like standard normally    distributed data (e.g. Gaussian with 0 mean and unit variance).    For instance many elements used in the objective function of    a learning algorithm (such as the RBF kernel of Support Vector    Machines or the L1 and L2 regularizers of linear models) assume that    all features are centered around 0 and have variance in the same    order. 
If a feature has a variance that is orders of magnitude larger    that others, it might dominate the objective function and make the    estimator unable to learn from other features correctly as expected.    Parameters    ----------    with_mean : boolean, True by default        If True, center the data before scaling.        This does not work (and will raise an exception) when attempted on        sparse matrices, because centering them entails building a dense        matrix which in common use cases is likely to be too large to fit in        memory.    with_std : boolean, True by default        If True, scale the data to unit variance (or equivalently,        unit standard deviation).    copy : boolean, optional, default True        If False, try to avoid a copy and do inplace scaling instead.        This is not guaranteed to always work inplace; e.g. if the data is        not a NumPy array or scipy.sparse CSR matrix, a copy may still be        returned.    Attributes    ----------    mean_ : array of floats with shape [n_features]        The mean value for each feature in the training set.    std_ : array of floats with shape [n_features]        The standard deviation for each feature in the training set.    See also    --------    :func:`sklearn.preprocessing.scale` to perform centering and    scaling without using the ``Transformer`` object oriented API    :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True`    to further remove the linear correlation across features.    """    def __init__(self, copy=True, with_mean=True, with_std=True):        self.with_mean = with_mean        self.with_std = with_std        self.copy = copy    def fit(self, X, y=None):        """Compute the mean and std to be used for later scaling.        Parameters        ----------        X : array-like or CSR matrix with shape [n_samples, n_features]            The data used to compute the mean and standard deviation            used for later scaling along the features axis.        """        X = check_array(X, accept_sparse='csr', copy=self.copy,                        ensure_2d=False)        if warn_if_not_float(X, estimator=self):            X = X.astype(np.float)        if sparse.issparse(X):            if self.with_mean:                raise ValueError(                    "Cannot center sparse matrices: pass `with_mean=False` "                    "instead. See docstring for motivation and alternatives.")            self.mean_ = None            if self.with_std:                var = mean_variance_axis(X, axis=0)[1]                self.std_ = np.sqrt(var)                self.std_[var == 0.0] = 1.0            else:                self.std_ = None            return self        else:            self.mean_, self.std_ = _mean_and_std(                X, axis=0, with_mean=self.with_mean, with_std=self.with_std)            return self    def transform(self, X, y=None, copy=None):        """Perform standardization by centering and scaling        Parameters        ----------        X : array-like with shape [n_samples, n_features]            The data used to scale along the features axis.        
"""        check_is_fitted(self, 'std_')        copy = copy if copy is not None else self.copy        X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)        if warn_if_not_float(X, estimator=self):            X = X.astype(np.float)        if sparse.issparse(X):            if self.with_mean:                raise ValueError(                    "Cannot center sparse matrices: pass `with_mean=False` "                    "instead. See docstring for motivation and alternatives.")            if self.std_ is not None:                inplace_column_scale(X, 1 / self.std_)        else:            if self.with_mean:                X -= self.mean_            if self.with_std:                X /= self.std_        return X    def inverse_transform(self, X, copy=None):        """Scale back the data to the original representation        Parameters        ----------        X : array-like with shape [n_samples, n_features]            The data used to scale along the features axis.        """        check_is_fitted(self, 'std_')        copy = copy if copy is not None else self.copy        if sparse.issparse(X):            if self.with_mean:                raise ValueError(                    "Cannot uncenter sparse matrices: pass `with_mean=False` "                    "instead See docstring for motivation and alternatives.")            if not sparse.isspmatrix_csr(X):                X = X.tocsr()                copy = False            if copy:                X = X.copy()            if self.std_ is not None:                inplace_column_scale(X, self.std_)        else:            X = np.asarray(X)            if copy:                X = X.copy()            if self.with_std:                X *= self.std_            if self.with_mean:                X += self.mean_        return Xclass PolynomialFeatures(BaseEstimator, TransformerMixin):    """Generate polynomial and interaction features.    Generate a new feature matrix consisting of all polynomial combinations    of the features with degree less than or equal to the specified degree.    For example, if an input sample is two dimensional and of the form    [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].    Parameters    ----------    degree : integer        The degree of the polynomial features. Default = 2.    interaction_only : boolean, default = False        If true, only interaction features are produced: features that are        products of at most ``degree`` *distinct* input features (so not        ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.).    include_bias : boolean        If True (default), then include a bias column, the feature in which        all polynomial powers are zero (i.e. a column of ones - acts as an        intercept term in a linear model).    Examples    --------    >>> X = np.arange(6).reshape(3, 2)    >>> X    array([[0, 1],           [2, 3],           [4, 5]])    >>> poly = PolynomialFeatures(2)    >>> poly.fit_transform(X)    array([[ 1,  0,  1,  0,  0,  1],           [ 1,  2,  3,  4,  6,  9],           [ 1,  4,  5, 16, 20, 25]])    >>> poly = PolynomialFeatures(interaction_only=True)    >>> poly.fit_transform(X)    array([[ 1,  0,  1,  0],           [ 1,  2,  3,  6],           [ 1,  4,  5, 20]])    Attributes    ----------    powers_ : array, shape (n_input_features, n_output_features)        powers_[i, j] is the exponent of the jth input in the ith output.    n_input_features_ : int        The total number of input features.    
n_output_features_ : int        The total number of polynomial output features. The number of output        features is computed by iterating over all suitably sized combinations        of input features.    Notes    -----    Be aware that the number of features in the output array scales    polynomially in the number of features of the input array, and    exponentially in the degree. High degrees can cause overfitting.    See :ref:`examples/linear_model/plot_polynomial_interpolation.py    <example_linear_model_plot_polynomial_interpolation.py>`    """    def __init__(self, degree=2, interaction_only=False, include_bias=True):        self.degree = degree        self.interaction_only = interaction_only        self.include_bias = include_bias    @staticmethod    def _combinations(n_features, degree, interaction_only, include_bias):        comb = (combinations if interaction_only else combinations_w_r)        start = int(not include_bias)        return chain.from_iterable(comb(range(n_features), i)                                   for i in range(start, degree + 1))    @property    def powers_(self):        check_is_fitted(self, 'n_input_features_')        combinations = self._combinations(self.n_input_features_, self.degree,                                          self.interaction_only,                                          self.include_bias)        return np.vstack(np.bincount(c, minlength=self.n_input_features_)                         for c in combinations)    def fit(self, X, y=None):        """        Compute number of output features.        """        n_samples, n_features = check_array(X).shape        combinations = self._combinations(n_features, self.degree,                                          self.interaction_only,                                          self.include_bias)        self.n_input_features_ = n_features        self.n_output_features_ = sum(1 for _ in combinations)        return self    def transform(self, X, y=None):        """Transform data to polynomial features        Parameters        ----------        X : array with shape [n_samples, n_features]            The data to transform, row by row.        Returns        -------        XP : np.ndarray shape [n_samples, NP]            The matrix of features, where NP is the number of polynomial            features generated from the combination of inputs.        """        check_is_fitted(self, ['n_input_features_', 'n_output_features_'])        X = check_array(X)        n_samples, n_features = X.shape        if n_features != self.n_input_features_:            raise ValueError("X shape does not match training shape")        # allocate output data        XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)        combinations = self._combinations(n_features, self.degree,                                          self.interaction_only,                                          self.include_bias)        for i, c in enumerate(combinations):            XP[:, i] = X[:, c].prod(1)        return XPdef normalize(X, norm='l2', axis=1, copy=True):    """Scale input vectors individually to unit norm (vector length).    Parameters    ----------    X : array or scipy.sparse matrix with shape [n_samples, n_features]        The data to normalize, element by element.        scipy.sparse matrices should be in CSR format to avoid an        un-necessary copy.    norm : 'l1' or 'l2', optional ('l2' by default)        The norm to use to normalize each non zero sample (or each non-zero        feature if axis is 0).    
axis : 0 or 1, optional (1 by default)        axis used to normalize the data along. If 1, independently normalize        each sample, otherwise (if 0) normalize each feature.    copy : boolean, optional, default True        set to False to perform inplace row normalization and avoid a        copy (if the input is already a numpy array or a scipy.sparse        CSR matrix and if axis is 1).    See also    --------    :class:`sklearn.preprocessing.Normalizer` to perform normalization    using the ``Transformer`` API (e.g. as part of a preprocessing    :class:`sklearn.pipeline.Pipeline`)    """    if norm not in ('l1', 'l2'):        raise ValueError("'%s' is not a supported norm" % norm)    if axis == 0:        sparse_format = 'csc'    elif axis == 1:        sparse_format = 'csr'    else:        raise ValueError("'%d' is not a supported axis" % axis)    X = check_array(X, sparse_format, copy=copy)    warn_if_not_float(X, 'The normalize function')    if axis == 0:        X = X.T    if sparse.issparse(X):        if norm == 'l1':            inplace_csr_row_normalize_l1(X)        elif norm == 'l2':            inplace_csr_row_normalize_l2(X)    else:        if norm == 'l1':            norms = np.abs(X).sum(axis=1)            norms[norms == 0.0] = 1.0        elif norm == 'l2':            norms = row_norms(X)            norms[norms == 0.0] = 1.0        X /= norms[:, np.newaxis]    if axis == 0:        X = X.T    return Xclass Normalizer(BaseEstimator, TransformerMixin):    """Normalize samples individually to unit norm.    Each sample (i.e. each row of the data matrix) with at least one    non zero component is rescaled independently of other samples so    that its norm (l1 or l2) equals one.    This transformer is able to work both with dense numpy arrays and    scipy.sparse matrix (use CSR format if you want to avoid the burden of    a copy / conversion).    Scaling inputs to unit norms is a common operation for text    classification or clustering for instance. For instance the dot    product of two l2-normalized TF-IDF vectors is the cosine similarity    of the vectors and is the base similarity metric for the Vector    Space Model commonly used by the Information Retrieval community.    Parameters    ----------    norm : 'l1' or 'l2', optional ('l2' by default)        The norm to use to normalize each non zero sample.    copy : boolean, optional, default True        set to False to perform inplace row normalization and avoid a        copy (if the input is already a numpy array or a scipy.sparse        CSR matrix).    Notes    -----    This estimator is stateless (besides constructor parameters), the    fit method does nothing but is useful when used in a pipeline.    See also    --------    :func:`sklearn.preprocessing.normalize` equivalent function    without the object oriented API    """    def __init__(self, norm='l2', copy=True):        self.norm = norm        self.copy = copy    def fit(self, X, y=None):        """Do nothing and return the estimator unchanged        This method is just there to implement the usual API and hence        work in pipelines.        """        X = check_array(X, accept_sparse='csr')        return self    def transform(self, X, y=None, copy=None):        """Scale each non zero row of X to unit norm        Parameters        ----------        X : array or scipy.sparse matrix with shape [n_samples, n_features]            The data to normalize, row by row. scipy.sparse matrices should be            in CSR format to avoid an un-necessary copy.        
"""        copy = copy if copy is not None else self.copy        X = check_array(X, accept_sparse='csr')        return normalize(X, norm=self.norm, axis=1, copy=copy)def binarize(X, threshold=0.0, copy=True):    """Boolean thresholding of array-like or scipy.sparse matrix    Parameters    ----------    X : array or scipy.sparse matrix with shape [n_samples, n_features]        The data to binarize, element by element.        scipy.sparse matrices should be in CSR or CSC format to avoid an        un-necessary copy.    threshold : float, optional (0.0 by default)        Feature values below or equal to this are replaced by 0, above it by 1.        Threshold may not be less than 0 for operations on sparse matrices.    copy : boolean, optional, default True        set to False to perform inplace binarization and avoid a copy        (if the input is already a numpy array or a scipy.sparse CSR / CSC        matrix and if axis is 1).    See also    --------    :class:`sklearn.preprocessing.Binarizer` to perform binarization    using the ``Transformer`` API (e.g. as part of a preprocessing    :class:`sklearn.pipeline.Pipeline`)    """    X = check_array(X, accept_sparse=['csr', 'csc'], copy=copy)    if sparse.issparse(X):        if threshold < 0:            raise ValueError('Cannot binarize a sparse matrix with threshold '                             '< 0')        cond = X.data > threshold        not_cond = np.logical_not(cond)        X.data[cond] = 1        X.data[not_cond] = 0        X.eliminate_zeros()    else:        cond = X > threshold        not_cond = np.logical_not(cond)        X[cond] = 1        X[not_cond] = 0    return Xclass Binarizer(BaseEstimator, TransformerMixin):    """Binarize data (set feature values to 0 or 1) according to a threshold    Values greater than the threshold map to 1, while values less than    or equal to the threshold map to 0. With the default threshold of 0,    only positive values map to 1.    Binarization is a common operation on text count data where the    analyst can decide to only consider the presence or absence of a    feature rather than a quantified number of occurrences for instance.    It can also be used as a pre-processing step for estimators that    consider boolean random variables (e.g. modelled using the Bernoulli    distribution in a Bayesian setting).    Parameters    ----------    threshold : float, optional (0.0 by default)        Feature values below or equal to this are replaced by 0, above it by 1.        Threshold may not be less than 0 for operations on sparse matrices.    copy : boolean, optional, default True        set to False to perform inplace binarization and avoid a copy (if        the input is already a numpy array or a scipy.sparse CSR matrix).    Notes    -----    If the input is a sparse matrix, only the non-zero values are subject    to update by the Binarizer class.    This estimator is stateless (besides constructor parameters), the    fit method does nothing but is useful when used in a pipeline.    """    def __init__(self, threshold=0.0, copy=True):        self.threshold = threshold        self.copy = copy    def fit(self, X, y=None):        """Do nothing and return the estimator unchanged        This method is just there to implement the usual API and hence        work in pipelines.        
"""        check_array(X, accept_sparse='csr')        return self    def transform(self, X, y=None, copy=None):        """Binarize each element of X        Parameters        ----------        X : array or scipy.sparse matrix with shape [n_samples, n_features]            The data to binarize, element by element.            scipy.sparse matrices should be in CSR format to avoid an            un-necessary copy.        """        copy = copy if copy is not None else self.copy        return binarize(X, threshold=self.threshold, copy=copy)class KernelCenterer(BaseEstimator, TransformerMixin):    """Center a kernel matrix    Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a    function mapping x to a Hilbert space. KernelCenterer centers (i.e.,    normalize to have zero mean) the data without explicitly computing phi(x).    It is equivalent to centering phi(x) with    sklearn.preprocessing.StandardScaler(with_std=False).    """    def fit(self, K, y=None):        """Fit KernelCenterer        Parameters        ----------        K : numpy array of shape [n_samples, n_samples]            Kernel matrix.        Returns        -------        self : returns an instance of self.        """        K = check_array(K)        n_samples = K.shape[0]        self.K_fit_rows_ = np.sum(K, axis=0) / n_samples        self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples        return self    def transform(self, K, y=None, copy=True):        """Center kernel matrix.        Parameters        ----------        K : numpy array of shape [n_samples1, n_samples2]            Kernel matrix.        copy : boolean, optional, default True            Set to False to perform inplace computation.        Returns        -------        K_new : numpy array of shape [n_samples1, n_samples2]        """        check_is_fitted(self, 'K_fit_all_')        K = check_array(K)        if copy:            K = K.copy()        K_pred_cols = (np.sum(K, axis=1) /                       self.K_fit_rows_.shape[0])[:, np.newaxis]        K -= self.K_fit_rows_        K -= K_pred_cols        K += self.K_fit_all_        return Kdef add_dummy_feature(X, value=1.0):    """Augment dataset with an additional dummy feature.    This is useful for fitting an intercept term with implementations which    cannot otherwise fit it directly.    Parameters    ----------    X : array or scipy.sparse matrix with shape [n_samples, n_features]        Data.    value : float        Value to use for the dummy feature.    Returns    -------    X : array or scipy.sparse matrix with shape [n_samples, n_features + 1]        Same data with dummy feature added as first column.    Examples    --------    >>> from sklearn.preprocessing import add_dummy_feature    >>> add_dummy_feature([[0, 1], [1, 0]])    array([[ 1.,  0.,  1.],           [ 1.,  1.,  0.]])    """    X = check_array(X, accept_sparse=['csc', 'csr', 'coo'])    n_samples, n_features = X.shape    shape = (n_samples, n_features + 1)    if sparse.issparse(X):        if sparse.isspmatrix_coo(X):            # Shift columns to the right.            col = X.col + 1            # Column indices of dummy feature are 0 everywhere.            col = np.concatenate((np.zeros(n_samples), col))            # Row indices of dummy feature are 0, ..., n_samples-1.            row = np.concatenate((np.arange(n_samples), X.row))            # Prepend the dummy feature n_samples times.            
data = np.concatenate((np.ones(n_samples) * value, X.data))            return sparse.coo_matrix((data, (row, col)), shape)        elif sparse.isspmatrix_csc(X):            # Shift index pointers since we need to add n_samples elements.            indptr = X.indptr + n_samples            # indptr[0] must be 0.            indptr = np.concatenate((np.array([0]), indptr))            # Row indices of dummy feature are 0, ..., n_samples-1.            indices = np.concatenate((np.arange(n_samples), X.indices))            # Prepend the dummy feature n_samples times.            data = np.concatenate((np.ones(n_samples) * value, X.data))            return sparse.csc_matrix((data, indices, indptr), shape)        else:            klass = X.__class__            return klass(add_dummy_feature(X.tocoo(), value))    else:        return np.hstack((np.ones((n_samples, 1)) * value, X))def _transform_selected(X, transform, selected="all", copy=True):    """Apply a transform function to portion of selected features    Parameters    ----------    X : array-like or sparse matrix, shape=(n_samples, n_features)        Dense array or sparse matrix.    transform : callable        A callable transform(X) -> X_transformed    copy : boolean, optional        Copy X even if it could be avoided.    selected: "all" or array of indices or mask        Specify which features to apply the transform to.    Returns    -------    X : array or sparse matrix, shape=(n_samples, n_features_new)    """    if selected == "all":        return transform(X)    X = check_array(X, accept_sparse='csc', copy=copy)    if len(selected) == 0:        return X    n_features = X.shape[1]    ind = np.arange(n_features)    sel = np.zeros(n_features, dtype=bool)    sel[np.asarray(selected)] = True    not_sel = np.logical_not(sel)    n_selected = np.sum(sel)    if n_selected == 0:        # No features selected.        return X    elif n_selected == n_features:        # All features selected.        return transform(X)    else:        X_sel = transform(X[:, ind[sel]])        X_not_sel = X[:, ind[not_sel]]        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):            return sparse.hstack((X_sel, X_not_sel))        else:            return np.hstack((X_sel, X_not_sel))class OneHotEncoder(BaseEstimator, TransformerMixin):    """Encode categorical integer features using a one-hot aka one-of-K scheme.    The input to this transformer should be a matrix of integers, denoting    the values taken on by categorical (discrete) features. The output will be    a sparse matrix where each column corresponds to one possible value of one    feature. It is assumed that input features take on values in the range    [0, n_values).    This encoding is needed for feeding categorical data to many scikit-learn    estimators, notably linear models and SVMs with the standard kernels.    Parameters    ----------    n_values : 'auto', int or array of ints        Number of values per feature.        - 'auto' : determine value range from training data.        - int : maximum value for all features.        - array : maximum value per feature.    categorical_features: "all" or array of indices or mask        Specify what features are treated as categorical.        - 'all' (default): All features are treated as categorical.        - array of indices: Array of categorical feature indices.        - mask: Array of length n_features and with dtype=bool.        Non-categorical features are always stacked to the right of the matrix.    
dtype : number type, default=np.float        Desired dtype of output.    sparse : boolean, default=True        Will return sparse matrix if set True else will return an array.    handle_unknown : str, 'error' or 'ignore'        Whether to raise an error or ignore if a unknown categorical feature is        present during transform.    Attributes    ----------    active_features_ : array        Indices for active features, meaning values that actually occur        in the training set. Only available when n_values is ``'auto'``.    feature_indices_ : array of shape (n_features,)        Indices to feature ranges.        Feature ``i`` in the original data is mapped to features        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``        (and then potentially masked by `active_features_` afterwards)    n_values_ : array of shape (n_features,)        Maximum number of values per feature.    Examples    --------    Given a dataset with three features and two samples, we let the encoder    find the maximum value per feature and transform the data to a binary    one-hot encoding.    >>> from sklearn.preprocessing import OneHotEncoder    >>> enc = OneHotEncoder()    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \[1, 0, 2]])  # doctest: +ELLIPSIS    OneHotEncoder(categorical_features='all', dtype=<... 'float'>,           handle_unknown='error', n_values='auto', sparse=True)    >>> enc.n_values_    array([2, 3, 4])    >>> enc.feature_indices_    array([0, 2, 5, 9])    >>> enc.transform([[0, 1, 1]]).toarray()    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])    See also    --------    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of      dictionary items (also handles string-valued features).    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot      encoding of dictionary items or strings.    """    def __init__(self, n_values="auto", categorical_features="all",                 dtype=np.float, sparse=True, handle_unknown='error'):        self.n_values = n_values        self.categorical_features = categorical_features        self.dtype = dtype        self.sparse = sparse        self.handle_unknown = handle_unknown    def fit(self, X, y=None):        """Fit OneHotEncoder to X.        Parameters        ----------        X : array-like, shape=(n_samples, n_feature)            Input array of type int.        Returns        -------        self        """        self.fit_transform(X)        return self    def _fit_transform(self, X):        """Assumes X contains only categorical features."""        X = check_array(X, dtype=np.int)        if np.any(X < 0):            raise ValueError("X needs to contain only non-negative integers.")        n_samples, n_features = X.shape        if self.n_values == 'auto':            n_values = np.max(X, axis=0) + 1        elif isinstance(self.n_values, numbers.Integral):            if (np.max(X, axis=0) >= self.n_values).any():                raise ValueError("Feature out of bounds for n_values=%d"                                 % self.n_values)            n_values = np.empty(n_features, dtype=np.int)            n_values.fill(self.n_values)        else:            try:                n_values = np.asarray(self.n_values, dtype=int)            except (ValueError, TypeError):                raise TypeError("Wrong type for parameter `n_values`. 
Expected"                                " 'auto', int or array of ints, got %r"                                % type(X))            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:                raise ValueError("Shape mismatch: if n_values is an array,"                                 " it has to be of shape (n_features,).")        self.n_values_ = n_values        n_values = np.hstack([[0], n_values])        indices = np.cumsum(n_values)        self.feature_indices_ = indices        column_indices = (X + indices[:-1]).ravel()        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),                                n_features)        data = np.ones(n_samples * n_features)        out = sparse.coo_matrix((data, (row_indices, column_indices)),                                shape=(n_samples, indices[-1]),                                dtype=self.dtype).tocsr()        if self.n_values == 'auto':            mask = np.array(out.sum(axis=0)).ravel() != 0            active_features = np.where(mask)[0]            out = out[:, active_features]            self.active_features_ = active_features        return out if self.sparse else out.toarray()    def fit_transform(self, X, y=None):        """Fit OneHotEncoder to X, then transform X.        Equivalent to self.fit(X).transform(X), but more convenient and more        efficient. See fit for the parameters, transform for the return value.        """        return _transform_selected(X, self._fit_transform,                                   self.categorical_features, copy=True)    def _transform(self, X):        """Assumes X contains only categorical features."""        X = check_array(X, dtype=np.int)        if np.any(X < 0):            raise ValueError("X needs to contain only non-negative integers.")        n_samples, n_features = X.shape        indices = self.feature_indices_        if n_features != indices.shape[0] - 1:            raise ValueError("X has different shape than during fitting."                             " Expected %d, got %d."                             % (indices.shape[0] - 1, n_features))        # We use only those catgorical features of X that are known using fit.        # i.e lesser than n_values_ using mask.        # This means, if self.handle_unknown is "ignore", the row_indices and        # col_indices corresponding to the unknown categorical feature are        # ignored.        mask = (X < self.n_values_).ravel()        if np.any(~mask):            if self.handle_unknown not in ['error', 'ignore']:                raise ValueError("handle_unknown should be either error or "                                 "unknown got %s" % self.handle_unknown)            if self.handle_unknown == 'error':                raise ValueError("unknown categorical feature present %s "                                 "during transform." % X[~mask])        column_indices = (X + indices[:-1]).ravel()[mask]        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),                                n_features)[mask]        data = np.ones(np.sum(mask))        out = sparse.coo_matrix((data, (row_indices, column_indices)),                                shape=(n_samples, indices[-1]),                                dtype=self.dtype).tocsr()        if self.n_values == 'auto':            out = out[:, self.active_features_]        return out if self.sparse else out.toarray()    def transform(self, X):        """Transform X using one-hot encoding.        
Parameters        ----------        X : array-like, shape=(n_samples, n_features)            Input array of type int.        Returns        -------        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int            Transformed input.        """        return _transform_selected(X, self._transform,                                   self.categorical_features, copy=True)
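The scaling utilities defined in data.py can be used either as plain functions or through the transformer API. Below is a short usage sketch of my own, assuming a scikit-learn build contemporaneous with this source (this version still exposes std_ on StandardScaler; later releases renamed it to scale_):

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale

X = np.array([[1., -1.,  2.],
              [2.,  0.,  0.],
              [0.,  1., -1.]])

# Function form: center each column and scale it to unit variance in one call.
X_scaled = scale(X)
print(X_scaled.mean(axis=0))  # approximately [0 0 0]
print(X_scaled.std(axis=0))   # [1 1 1]

# Transformer form: fit on training data, then reuse the same statistics later.
scaler = StandardScaler().fit(X)
print(scaler.mean_, scaler.std_)
print(scaler.transform([[-1., 1., 0.]]))

# MinMaxScaler maps every feature into feature_range, (0, 1) by default.
print(MinMaxScaler().fit_transform(X))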
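In the same spirit, here is a hedged sketch (mine, not from the module) showing PolynomialFeatures, Normalizer and Binarizer on a tiny dense array:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures, Normalizer, Binarizer

X = np.arange(6).reshape(3, 2).astype(float)   # [[0 1] [2 3] [4 5]]

# Degree-2 expansion of [a, b] -> [1, a, b, a^2, a*b, b^2] per row.
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))
print(poly.powers_)            # exponents of the input features for each output feature

# Row-wise l2 normalization: every sample ends up with unit Euclidean norm.
print(Normalizer(norm='l2').fit_transform(X))

# Thresholding: values strictly greater than 2.0 become 1, the rest 0.
print(Binarizer(threshold=2.0).fit_transform(X))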
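Finally for this file, OneHotEncoder and add_dummy_feature behave as the examples embedded in their docstrings above describe; the snippet below simply replays those examples and is again only an illustrative sketch:

import numpy as np
from sklearn.preprocessing import OneHotEncoder, add_dummy_feature

# Three categorical integer features; the encoder learns each feature's value
# range and expands it into indicator columns.
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
print(enc.n_values_)         # [2 3 4]
print(enc.feature_indices_)  # [0 2 5 9]
print(enc.transform([[0, 1, 1]]).toarray())

# add_dummy_feature prepends a constant column, e.g. for an explicit intercept.
print(add_dummy_feature([[0, 1], [1, 0]]))
# [[ 1.  0.  1.]
#  [ 1.  1.  0.]]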

imputation.py

# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com># License: BSD 3 clauseimport warningsimport numpy as npimport numpy.ma as mafrom scipy import sparsefrom scipy import statsfrom ..base import BaseEstimator, TransformerMixinfrom ..utils import array2dfrom ..utils import atleast2d_or_csrfrom ..utils import atleast2d_or_cscfrom ..utils import as_float_arrayfrom ..utils.fixes import astypefrom ..externals import sixzip = six.moves.zipmap = six.moves.map__all__ = [    'Imputer',]def _get_mask(X, value_to_mask):    """Compute the boolean mask X == missing_values."""    if value_to_mask == "NaN" or np.isnan(value_to_mask):        return np.isnan(X)    else:        return X == value_to_maskdef _get_median(data, n_zeros):    """Compute the median of data with n_zeros additional zeros.    This function is used to support sparse matrices; it modifies data in-place    """     n_elems = len(data) + n_zeros    if not n_elems:        return np.nan    n_negative = np.count_nonzero(data < 0)    middle, is_odd = divmod(n_elems, 2)    data.sort()    if is_odd:        return _get_elem_at_rank(middle, data, n_negative, n_zeros)    return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) +            _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.def _get_elem_at_rank(rank, data, n_negative, n_zeros):    """Find the value in data augmented with n_zeros for the given rank"""    if rank < n_negative:        return data[rank]    if rank - n_negative < n_zeros:        return 0    return data[rank - n_zeros]def _most_frequent(array, extra_value, n_repeat):    """Compute the most frequent value in a 1d array extended with       [extra_value] * n_repeat, where extra_value is assumed to be not part       of the array."""    # Compute the most frequent value in array only    if array.size > 0:        mode = stats.mode(array)        most_frequent_value = mode[0][0]        most_frequent_count = mode[1][0]    else:        most_frequent_value = 0        most_frequent_count = 0    # Compare to array + [extra_value] * n_repeat    if most_frequent_count == 0 and n_repeat == 0:        return np.nan    elif most_frequent_count < n_repeat:        return extra_value    elif most_frequent_count > n_repeat:        return most_frequent_value    elif most_frequent_count == n_repeat:        # Ties the breaks. Copy the behaviour of scipy.stats.mode        if most_frequent_value < extra_value:            return most_frequent_value        else:            return extra_valueclass Imputer(BaseEstimator, TransformerMixin):    """Imputation transformer for completing missing values.    Parameters    ----------    missing_values : integer or "NaN", optional (default="NaN")        The placeholder for the missing values. All occurrences of        `missing_values` will be imputed. For missing values encoded as np.nan,        use the string value "NaN".    strategy : string, optional (default="mean")        The imputation strategy.        - If "mean", then replace missing values using the mean along          the axis.        - If "median", then replace missing values using the median along          the axis.        - If "most_frequent", then replace missing using the most frequent          value along the axis.    axis : integer, optional (default=0)        The axis along which to impute.        - If `axis=0`, then impute along columns.        - If `axis=1`, then impute along rows.    verbose : integer, optional (default=0)        Controls the verbosity of the imputer.    
copy : boolean, optional (default=True)        If True, a copy of X will be created. If False, imputation will        be done in-place whenever possible. Note that, in the following cases,        a new copy will always be made, even if `copy=False`:        - If X is not an array of floating values;        - If X is sparse and `missing_values=0`;        - If `axis=0` and X is encoded as a CSR matrix;        - If `axis=1` and X is encoded as a CSC matrix.    Attributes    ----------    `statistics_` : array of shape (n_features,)        The imputation fill value for each feature if axis == 0.    Notes    -----    - When ``axis=0``, columns which only contained missing values at `fit`      are discarded upon `transform`.    - When ``axis=1``, an exception is raised if there are rows for which it is      not possible to fill in the missing values (e.g., because they only      contain missing values).    """    def __init__(self, missing_values="NaN", strategy="mean",                 axis=0, verbose=0, copy=True):        self.missing_values = missing_values        self.strategy = strategy        self.axis = axis        self.verbose = verbose        self.copy = copy    def fit(self, X, y=None):        """Fit the imputer on X.        Parameters        ----------        X : {array-like, sparse matrix}, shape (n_samples, n_features)            Input data, where ``n_samples`` is the number of samples and            ``n_features`` is the number of features.        Returns        -------        self : object            Returns self.        """        # Check parameters        allowed_strategies = ["mean", "median", "most_frequent"]        if self.strategy not in allowed_strategies:            raise ValueError("Can only use these strategies: {0} "                             " got strategy={1}".format(allowed_strategies,                                                        self.strategy))        if self.axis not in [0, 1]:            raise ValueError("Can only impute missing values on axis 0 and 1, "                             " got axis={0}".format(self.axis))        # Since two different arrays can be provided in fit(X) and        # transform(X), the imputation data will be computed in transform()        # when the imputation is done per sample (i.e., when axis=1).        if self.axis == 0:            X = atleast2d_or_csc(X, dtype=np.float64, force_all_finite=False)            if sparse.issparse(X):                self.statistics_ = self._sparse_fit(X,                                                    self.strategy,                                                    self.missing_values,                                                    self.axis)            else:                self.statistics_ = self._dense_fit(X,                                                   self.strategy,                                                   self.missing_values,                                                   self.axis)        return self    def _sparse_fit(self, X, strategy, missing_values, axis):        """Fit the transformer on sparse data."""        # Imputation is done "by column", so if we want to do it        # by row we only need to convert the matrix to csr format.        
if axis == 1:            X = X.tocsr()        else:            X = X.tocsc()        # Count the zeros        if missing_values == 0:            n_zeros_axis = np.zeros(X.shape[not axis], dtype=int)        else:            n_zeros_axis = X.shape[axis] - np.diff(X.indptr)        # Mean        if strategy == "mean":            if missing_values != 0:                n_non_missing = n_zeros_axis                # Mask the missing elements                mask_missing_values = _get_mask(X.data, missing_values)                mask_valids = np.logical_not(mask_missing_values)                # Sum only the valid elements                new_data = X.data.copy()                new_data[mask_missing_values] = 0                X = sparse.csc_matrix((new_data, X.indices, X.indptr),                                      copy=False)                sums = X.sum(axis=0)                # Count the elements != 0                mask_non_zeros = sparse.csc_matrix(                    (mask_valids.astype(np.float64),                     X.indices,                     X.indptr), copy=False)                s = mask_non_zeros.sum(axis=0)                n_non_missing = np.add(n_non_missing, s)            else:                sums = X.sum(axis=axis)                n_non_missing = np.diff(X.indptr)            # Ignore the error, columns with a np.nan statistics_            # are not an error at this point. These columns will            # be removed in transform            with np.errstate(all="ignore"):                return np.ravel(sums) / np.ravel(n_non_missing)        # Median + Most frequent        else:            # Remove the missing values, for each column            columns_all = np.hsplit(X.data, X.indptr[1:-1])            mask_missing_values = _get_mask(X.data, missing_values)            mask_valids = np.hsplit(np.logical_not(mask_missing_values),                                    X.indptr[1:-1])            # astype necessary for bug in numpy.hsplit before v1.9            columns = [col[astype(mask, bool, copy=False)]                       for col, mask in zip(columns_all, mask_valids)]            # Median            if strategy == "median":                median = np.empty(len(columns))                for i, column in enumerate(columns):                    median[i] = _get_median(column, n_zeros_axis[i])                return median            # Most frequent            elif strategy == "most_frequent":                most_frequent = np.empty(len(columns))                for i, column in enumerate(columns):                    most_frequent[i] = _most_frequent(column,                                                      0,                                                      n_zeros_axis[i])                return most_frequent    def _dense_fit(self, X, strategy, missing_values, axis):        """Fit the transformer on dense data."""        X = array2d(X, force_all_finite=False)        mask = _get_mask(X, missing_values)        masked_X = ma.masked_array(X, mask=mask)        # Mean        if strategy == "mean":            mean_masked = np.ma.mean(masked_X, axis=axis)            # Avoid the warning "Warning: converting a masked element to nan."            mean = np.ma.getdata(mean_masked)            mean[np.ma.getmask(mean_masked)] = np.nan            return mean        # Median        elif strategy == "median":            if tuple(int(v) for v in np.__version__.split('.')[:2]) < (1, 5):                # In old versions of numpy, calling a median on an array                # containing nans returns nan. 
This is different is                # recent versions of numpy, which we want to mimic                masked_X.mask = np.logical_or(masked_X.mask,                                              np.isnan(X))            median_masked = np.ma.median(masked_X, axis=axis)            # Avoid the warning "Warning: converting a masked element to nan."            median = np.ma.getdata(median_masked)            median[np.ma.getmaskarray(median_masked)] = np.nan            return median        # Most frequent        elif strategy == "most_frequent":            # scipy.stats.mstats.mode cannot be used because it will no work            # properly if the first element is masked and if it's frequency            # is equal to the frequency of the most frequent valid element            # See https://github.com/scipy/scipy/issues/2636            # To be able access the elements by columns            if axis == 0:                X = X.transpose()                mask = mask.transpose()            most_frequent = np.empty(X.shape[0])            for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):                row_mask = np.logical_not(row_mask).astype(np.bool)                row = row[row_mask]                most_frequent[i] = _most_frequent(row, np.nan, 0)            return most_frequent    def transform(self, X):        """Impute all missing values in X.        Parameters        ----------        X : {array-like, sparse matrix}, shape = [n_samples, n_features]            The input data to complete.        """        # Copy just once        X = as_float_array(X, copy=self.copy, force_all_finite=False)        # Since two different arrays can be provided in fit(X) and        # transform(X), the imputation data need to be recomputed        # when the imputation is done per sample        if self.axis == 1:            X = atleast2d_or_csr(X, force_all_finite=False, copy=False)            if sparse.issparse(X):                statistics = self._sparse_fit(X,                                              self.strategy,                                              self.missing_values,                                              self.axis)            else:                statistics = self._dense_fit(X,                                             self.strategy,                                             self.missing_values,                                             self.axis)        else:            X = atleast2d_or_csc(X, force_all_finite=False, copy=False)            statistics = self.statistics_        # Delete the invalid rows/columns        invalid_mask = np.isnan(statistics)        valid_mask = np.logical_not(invalid_mask)        valid_statistics = statistics[valid_mask]        valid_statistics_indexes = np.where(valid_mask)[0]        missing = np.arange(X.shape[not self.axis])[invalid_mask]        if self.axis == 0 and invalid_mask.any():            if self.verbose:                warnings.warn("Deleting features without "                              "observed values: %s" % missing)            X = X[:, valid_statistics_indexes]        elif self.axis == 1 and invalid_mask.any():            raise ValueError("Some rows only contain "                             "missing values: %s" % missing)        # Do actual imputation        if sparse.issparse(X) and self.missing_values != 0:            mask = _get_mask(X.data, self.missing_values)            indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),                                np.diff(X.indptr))[mask]            X.data[mask] = 
valid_statistics[indexes].astype(X.dtype)        else:            if sparse.issparse(X):                X = X.toarray()            mask = _get_mask(X, self.missing_values)            n_missing = np.sum(mask, axis=self.axis)            values = np.repeat(valid_statistics, n_missing)            if self.axis == 0:                coordinates = np.where(mask.transpose())[::-1]            else:                coordinates = mask            X[coordinates] = values        return X
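A short usage sketch for Imputer (mine, not from the file; this class was later deprecated in favour of sklearn.impute.SimpleImputer, but the call below matches the API defined above):

import numpy as np
from sklearn.preprocessing import Imputer

X = np.array([[1., 2.],
              [np.nan, 3.],
              [7., 6.]])

# Replace NaNs by the per-column mean (axis=0) computed from the observed values.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
print(imp.fit_transform(X))
# [[ 1.  2.]
#  [ 4.  3.]     <- the NaN is replaced by the mean of column 0, (1 + 7) / 2 = 4.0
#  [ 7.  6.]]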

label.py

# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>#          Mathieu Blondel <mathieu@mblondel.org>#          Olivier Grisel <olivier.grisel@ensta.org>#          Andreas Mueller <amueller@ais.uni-bonn.de>#          Joel Nothman <joel.nothman@gmail.com>#          Hamzeh Alsalhi <ha258@cornell.edu># License: BSD 3 clausefrom collections import defaultdictimport itertoolsimport arrayimport warningsimport numpy as npimport scipy.sparse as spfrom ..base import BaseEstimator, TransformerMixinfrom ..utils.fixes import np_versionfrom ..utils.fixes import sparse_min_maxfrom ..utils.fixes import astypefrom ..utils.fixes import in1dfrom ..utils import deprecated, column_or_1dfrom ..utils.validation import check_arrayfrom ..utils.validation import _num_samplesfrom ..utils.multiclass import unique_labelsfrom ..utils.multiclass import type_of_targetfrom ..externals import sixzip = six.moves.zipmap = six.moves.map__all__ = [    'label_binarize',    'LabelBinarizer',    'LabelEncoder',    'MultiLabelBinarizer',]def _check_numpy_unicode_bug(labels):    """Check that user is not subject to an old numpy bug    Fixed in master before 1.7.0:      https://github.com/numpy/numpy/pull/243    """    if np_version[:3] < (1, 7, 0) and labels.dtype.kind == 'U':        raise RuntimeError("NumPy < 1.7.0 does not implement searchsorted"                           " on unicode data correctly. Please upgrade"                           " NumPy to use LabelEncoder with unicode inputs.")class LabelEncoder(BaseEstimator, TransformerMixin):    """Encode labels with value between 0 and n_classes-1.    Attributes    ----------    classes_ : array of shape (n_class,)        Holds the label for each class.    Examples    --------    `LabelEncoder` can be used to normalize labels.    >>> from sklearn import preprocessing    >>> le = preprocessing.LabelEncoder()    >>> le.fit([1, 2, 2, 6])    LabelEncoder()    >>> le.classes_    array([1, 2, 6])    >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS    array([0, 0, 1, 2]...)    >>> le.inverse_transform([0, 0, 1, 2])    array([1, 1, 2, 6])    It can also be used to transform non-numerical labels (as long as they are    hashable and comparable) to numerical labels.    >>> le = preprocessing.LabelEncoder()    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])    LabelEncoder()    >>> list(le.classes_)    ['amsterdam', 'paris', 'tokyo']    >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS    array([2, 2, 1]...)    >>> list(le.inverse_transform([2, 2, 1]))    ['tokyo', 'tokyo', 'paris']    """    def _check_fitted(self):        if not hasattr(self, "classes_"):            raise ValueError("LabelEncoder was not fitted yet.")    def fit(self, y):        """Fit label encoder        Parameters        ----------        y : array-like of shape (n_samples,)            Target values.        Returns        -------        self : returns an instance of self.        """        y = column_or_1d(y, warn=True)        _check_numpy_unicode_bug(y)        self.classes_ = np.unique(y)        return self    def fit_transform(self, y):        """Fit label encoder and return encoded labels        Parameters        ----------        y : array-like of shape [n_samples]            Target values.        
Returns        -------        y : array-like of shape [n_samples]        """        y = column_or_1d(y, warn=True)        _check_numpy_unicode_bug(y)        self.classes_, y = np.unique(y, return_inverse=True)        return y    def transform(self, y):        """Transform labels to normalized encoding.        Parameters        ----------        y : array-like of shape [n_samples]            Target values.        Returns        -------        y : array-like of shape [n_samples]        """        self._check_fitted()        classes = np.unique(y)        _check_numpy_unicode_bug(classes)        if len(np.intersect1d(classes, self.classes_)) < len(classes):            diff = np.setdiff1d(classes, self.classes_)            raise ValueError("y contains new labels: %s" % str(diff))        return np.searchsorted(self.classes_, y)    def inverse_transform(self, y):        """Transform labels back to original encoding.        Parameters        ----------        y : numpy array of shape [n_samples]            Target values.        Returns        -------        y : numpy array of shape [n_samples]        """        self._check_fitted()        y = np.asarray(y)        return self.classes_[y]class LabelBinarizer(BaseEstimator, TransformerMixin):    """Binarize labels in a one-vs-all fashion    Several regression and binary classification algorithms are    available in the scikit. A simple way to extend these algorithms    to the multi-class classification case is to use the so-called    one-vs-all scheme.    At learning time, this simply consists in learning one regressor    or binary classifier per class. In doing so, one needs to convert    multi-class labels to binary labels (belong or does not belong    to the class). LabelBinarizer makes this process easy with the    transform method.    At prediction time, one assigns the class for which the corresponding    model gave the greatest confidence. LabelBinarizer makes this easy    with the inverse_transform method.    Parameters    ----------    neg_label : int (default: 0)        Value with which negative labels must be encoded.    pos_label : int (default: 1)        Value with which positive labels must be encoded.    sparse_output : boolean (default: False)        True if the returned array from transform is desired to be in sparse        CSR format.    Attributes    ----------    classes_ : array of shape [n_class]        Holds the label for each class.    y_type_ : str,        Represents the type of the target data as evaluated by        utils.multiclass.type_of_target. Possible type are 'continuous',        'continuous-multioutput', 'binary', 'multiclass',        'mutliclass-multioutput', 'multilabel-sequences',        'multilabel-indicator', and 'unknown'.    multilabel_ : boolean        True if the transformer was fitted on a multilabel rather than a        multiclass set of labels. The ``multilabel_`` attribute is deprecated        and will be removed in 0.18    sparse_input_ : boolean,        True if the input data to transform is given as a sparse matrix, False        otherwise.    indicator_matrix_ : str        'sparse' when the input data to tansform is a multilable-indicator and        is sparse, None otherwise. 
The ``indicator_matrix_`` attribute is        deprecated as of version 0.16 and will be removed in 0.18    Examples    --------    >>> from sklearn import preprocessing    >>> lb = preprocessing.LabelBinarizer()    >>> lb.fit([1, 2, 6, 4, 2])    LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)    >>> lb.classes_    array([1, 2, 4, 6])    >>> lb.transform([1, 6])    array([[1, 0, 0, 0],           [0, 0, 0, 1]])    Binary targets transform to a column vector    >>> lb = preprocessing.LabelBinarizer()    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])    array([[1],           [0],           [0],           [1]])    Passing a 2D matrix for multilabel classification    >>> import numpy as np    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))    LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)    >>> lb.classes_    array([0, 1, 2])    >>> lb.transform([0, 1, 2, 1])    array([[1, 0, 0],           [0, 1, 0],           [0, 0, 1],           [0, 1, 0]])    See also    --------    label_binarize : function to perform the transform operation of        LabelBinarizer with fixed classes.    """    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):        if neg_label >= pos_label:            raise ValueError("neg_label={0} must be strictly less than "                             "pos_label={1}.".format(neg_label, pos_label))        if sparse_output and (pos_label == 0 or neg_label != 0):            raise ValueError("Sparse binarization is only supported with non "                             "zero pos_label and zero neg_label, got "                             "pos_label={0} and neg_label={1}"                             "".format(pos_label, neg_label))        self.neg_label = neg_label        self.pos_label = pos_label        self.sparse_output = sparse_output    @property    @deprecated("Attribute ``indicator_matrix_`` is deprecated and will be "                "removed in 0.17. Use ``y_type_ == 'multilabel-indicator'`` "                "instead")    def indicator_matrix_(self):        return self.y_type_ == 'multilabel-indicator'    @property    @deprecated("Attribute ``multilabel_`` is deprecated and will be removed "                "in 0.17. Use ``y_type_.startswith('multilabel')`` "                "instead")    def multilabel_(self):        return self.y_type_.startswith('multilabel')    def _check_fitted(self):        if not hasattr(self, "classes_"):            raise ValueError("LabelBinarizer was not fitted yet.")    def fit(self, y):        """Fit label binarizer        Parameters        ----------        y : numpy array of shape (n_samples,) or (n_samples, n_classes)            Target values. The 2-d matrix should only contain 0 and 1,            represents multilabel classification.        Returns        -------        self : returns an instance of self.        """        self.y_type_ = type_of_target(y)        if 'multioutput' in self.y_type_:            raise ValueError("Multioutput target data is not supported with "                             "label binarization")        if _num_samples(y) == 0:            raise ValueError('y has 0 samples: %r' % y)        self.sparse_input_ = sp.issparse(y)        self.classes_ = unique_labels(y)        return self    def transform(self, y):        """Transform multi-class labels to binary labels        The output of transform is sometimes referred to by some authors as the        1-of-K coding scheme.        
Parameters        ----------        y : numpy array or sparse matrix of shape (n_samples,) or            (n_samples, n_classes) Target values. The 2-d matrix should only            contain 0 and 1, represents multilabel classification. Sparse            matrix can be CSR, CSC, COO, DOK, or LIL.        Returns        -------        Y : numpy array or CSR matrix of shape [n_samples, n_classes]            Shape will be [n_samples, 1] for binary problems.        """        self._check_fitted()        y_is_multilabel = type_of_target(y).startswith('multilabel')        if y_is_multilabel and not self.y_type_.startswith('multilabel'):            raise ValueError("The object was not fitted with multilabel"                             " input.")        return label_binarize(y, self.classes_,                              pos_label=self.pos_label,                              neg_label=self.neg_label,                              sparse_output=self.sparse_output)    def inverse_transform(self, Y, threshold=None):        """Transform binary labels back to multi-class labels        Parameters        ----------        Y : numpy array or sparse matrix with shape [n_samples, n_classes]            Target values. All sparse matrices are converted to CSR before            inverse transformation.        threshold : float or None            Threshold used in the binary and multi-label cases.            Use 0 when:                - Y contains the output of decision_function (classifier)            Use 0.5 when:                - Y contains the output of predict_proba            If None, the threshold is assumed to be half way between            neg_label and pos_label.        Returns        -------        y : numpy array or CSR matrix of shape [n_samples] Target values.        Notes        -----        In the case when the binary labels are fractional        (probabilistic), inverse_transform chooses the class with the        greatest value. Typically, this allows to use the output of a        linear model's decision_function method directly as the input        of inverse_transform.        """        self._check_fitted()        if threshold is None:            threshold = (self.pos_label + self.neg_label) / 2.        if self.y_type_ == "multiclass":            y_inv = _inverse_binarize_multiclass(Y, self.classes_)        else:            y_inv = _inverse_binarize_thresholding(Y, self.y_type_,                                                   self.classes_, threshold)        if self.sparse_input_:            y_inv = sp.csr_matrix(y_inv)        elif sp.issparse(y_inv):            y_inv = y_inv.toarray()        return y_invdef label_binarize(y, classes, neg_label=0, pos_label=1,                   sparse_output=False, multilabel=None):    """Binarize labels in a one-vs-all fashion    Several regression and binary classification algorithms are    available in the scikit. A simple way to extend these algorithms    to the multi-class classification case is to use the so-called    one-vs-all scheme.    This function makes it possible to compute this transformation for a    fixed set of class labels known ahead of time.    Parameters    ----------    y : array-like        Sequence of integer labels or multilabel data to encode.    classes : array-like of shape [n_classes]        Uniquely holds the label for each class.    neg_label : int (default: 0)        Value with which negative labels must be encoded.    pos_label : int (default: 1)        Value with which positive labels must be encoded.    
sparse_output : boolean (default: False),        Set to true if output binary array is desired in CSR sparse format    Returns    -------    Y : numpy array or CSR matrix of shape [n_samples, n_classes]        Shape will be [n_samples, 1] for binary problems.    Examples    --------    >>> from sklearn.preprocessing import label_binarize    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])    array([[1, 0, 0, 0],           [0, 0, 0, 1]])    The class ordering is preserved:    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])    array([[1, 0, 0, 0],           [0, 1, 0, 0]])    Binary targets transform to a column vector    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])    array([[1],           [0],           [0],           [1]])    See also    --------    LabelBinarizer : class used to wrap the functionality of label_binarize and        allow for fitting to classes independently of the transform operation    """    if not isinstance(y, list):        # XXX Workaround that will be removed when list of list format is        # dropped        y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)    else:        if _num_samples(y) == 0:            raise ValueError('y has 0 samples: %r' % y)    if neg_label >= pos_label:        raise ValueError("neg_label={0} must be strictly less than "                         "pos_label={1}.".format(neg_label, pos_label))    if (sparse_output and (pos_label == 0 or neg_label != 0)):        raise ValueError("Sparse binarization is only supported with non "                         "zero pos_label and zero neg_label, got "                         "pos_label={0} and neg_label={1}"                         "".format(pos_label, neg_label))    if multilabel is not None:        warnings.warn("The multilabel parameter is deprecated as of version "                      "0.15 and will be removed in 0.17. 
The parameter is no "                      "longer necessary because the value is automatically "                      "inferred.", DeprecationWarning)    # To account for pos_label == 0 in the dense case    pos_switch = pos_label == 0    if pos_switch:        pos_label = -neg_label    y_type = type_of_target(y)    if 'multioutput' in y_type:        raise ValueError("Multioutput target data is not supported with label "                         "binarization")    n_samples = y.shape[0] if sp.issparse(y) else len(y)    n_classes = len(classes)    classes = np.asarray(classes)    if y_type == "binary":        if len(classes) == 1:            Y = np.zeros((len(y), 1), dtype=np.int)            Y += neg_label            return Y        elif len(classes) >= 3:            y_type = "multiclass"    sorted_class = np.sort(classes)    if (y_type == "multilabel-indicator" and classes.size != y.shape[1]):        raise ValueError("classes {0} missmatch with the labels {1}"                         "found in the data".format(classes, unique_labels(y)))    if y_type in ("binary", "multiclass"):        y = column_or_1d(y)        # pick out the known labels from y        y_in_classes = in1d(y, classes)        y_seen = y[y_in_classes]        indices = np.searchsorted(sorted_class, y_seen)        indptr = np.hstack((0, np.cumsum(y_in_classes)))        data = np.empty_like(indices)        data.fill(pos_label)        Y = sp.csr_matrix((data, indices, indptr),                          shape=(n_samples, n_classes))    elif y_type == "multilabel-indicator":        Y = sp.csr_matrix(y)        if pos_label != 1:            data = np.empty_like(Y.data)            data.fill(pos_label)            Y.data = data    elif y_type == "multilabel-sequences":        Y = MultiLabelBinarizer(classes=classes,                                sparse_output=sparse_output).fit_transform(y)        if sp.issparse(Y):            Y.data[:] = pos_label        else:            Y[Y == 1] = pos_label        return Y    if not sparse_output:        Y = Y.toarray()        Y = astype(Y, int, copy=False)        if neg_label != 0:            Y[Y == 0] = neg_label        if pos_switch:            Y[Y == pos_label] = 0    else:        Y.data = astype(Y.data, int, copy=False)    # preserve label ordering    if np.any(classes != sorted_class):        indices = np.searchsorted(sorted_class, classes)        Y = Y[:, indices]    if y_type == "binary":        if sparse_output:            Y = Y.getcol(-1)        else:            Y = Y[:, -1].reshape((-1, 1))    return Ydef _inverse_binarize_multiclass(y, classes):    """Inverse label binarization transformation for multiclass.    Multiclass uses the maximal score instead of a threshold.    
"""    classes = np.asarray(classes)    if sp.issparse(y):        # Find the argmax for each row in y where y is a CSR matrix        y = y.tocsr()        n_samples, n_outputs = y.shape        outputs = np.arange(n_outputs)        row_max = sparse_min_max(y, 1)[1]        row_nnz = np.diff(y.indptr)        y_data_repeated_max = np.repeat(row_max, row_nnz)        # picks out all indices obtaining the maximum per row        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)        # For corner case where last row has a max of 0        if row_max[-1] == 0:            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])        # Gets the index of the first argmax in each row from y_i_all_argmax        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])        # first argmax of each row        y_ind_ext = np.append(y.indices, [0])        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]        # Handle rows of all 0        y_i_argmax[np.where(row_nnz == 0)[0]] = 0        # Handles rows with max of 0 that contain negative numbers        samples = np.arange(n_samples)[(row_nnz > 0) &                                       (row_max.ravel() == 0)]        for i in samples:            ind = y.indices[y.indptr[i]:y.indptr[i + 1]]            y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]        return classes[y_i_argmax]    else:        return classes.take(y.argmax(axis=1), mode="clip")def _inverse_binarize_thresholding(y, output_type, classes, threshold):    """Inverse label binarization transformation using thresholding."""    if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:        raise ValueError("output_type='binary', but y.shape = {0}".                         format(y.shape))    if output_type != "binary" and y.shape[1] != len(classes):        raise ValueError("The number of class is not equal to the number of "                         "dimension of y.")    classes = np.asarray(classes)    # Perform thresholding    if sp.issparse(y):        if threshold > 0:            if y.format not in ('csr', 'csc'):                y = y.tocsr()            y.data = np.array(y.data > threshold, dtype=np.int)            y.eliminate_zeros()        else:            y = np.array(y.toarray() > threshold, dtype=np.int)    else:        y = np.array(y > threshold, dtype=np.int)    # Inverse transform data    if output_type == "binary":        if sp.issparse(y):            y = y.toarray()        if y.ndim == 2 and y.shape[1] == 2:            return classes[y[:, 1]]        else:            if len(classes) == 1:                y = np.empty(len(y), dtype=classes.dtype)                y.fill(classes[0])                return y            else:                return classes[y.ravel()]    elif output_type == "multilabel-indicator":        return y    elif output_type == "multilabel-sequences":        warnings.warn('Direct support for sequence of sequences multilabel '                      'representation will be unavailable from version 0.17. 
'                      'Use sklearn.preprocessing.MultiLabelBinarizer to '                      'convert to a label indicator representation.',                      DeprecationWarning)        mlb = MultiLabelBinarizer(classes=classes).fit([])        return mlb.inverse_transform(y)    else:        raise ValueError("{0} format is not supported".format(output_type))class MultiLabelBinarizer(BaseEstimator, TransformerMixin):    """Transform between iterable of iterables and a multilabel format    Although a list of sets or tuples is a very intuitive format for multilabel    data, it is unwieldy to process. This transformer converts between this    intuitive format and the supported multilabel format: a (samples x classes)    binary matrix indicating the presence of a class label.    Parameters    ----------    classes : array-like of shape [n_classes] (optional)        Indicates an ordering for the class labels    sparse_output : boolean (default: False),        Set to true if output binary array is desired in CSR sparse format    Attributes    ----------    classes_ : array of labels        A copy of the `classes` parameter where provided,        or otherwise, the sorted set of classes found when fitting.    Examples    --------    >>> mlb = MultiLabelBinarizer()    >>> mlb.fit_transform([(1, 2), (3,)])    array([[1, 1, 0],           [0, 0, 1]])    >>> mlb.classes_    array([1, 2, 3])    >>> mlb.fit_transform([set(['sci-fi', 'thriller']), set(['comedy'])])    array([[0, 1, 1],           [1, 0, 0]])    >>> list(mlb.classes_)    ['comedy', 'sci-fi', 'thriller']    """    def __init__(self, classes=None, sparse_output=False):        self.classes = classes        self.sparse_output = sparse_output    def fit(self, y):        """Fit the label sets binarizer, storing `classes_`        Parameters        ----------        y : iterable of iterables            A set of labels (any orderable and hashable object) for each            sample. If the `classes` parameter is set, `y` will not be            iterated.        Returns        -------        self : returns this MultiLabelBinarizer instance        """        if self.classes is None:            classes = sorted(set(itertools.chain.from_iterable(y)))        else:            classes = self.classes        dtype = np.int if all(isinstance(c, int) for c in classes) else object        self.classes_ = np.empty(len(classes), dtype=dtype)        self.classes_[:] = classes        return self    def fit_transform(self, y):        """Fit the label sets binarizer and transform the given label sets        Parameters        ----------        y : iterable of iterables            A set of labels (any orderable and hashable object) for each            sample. If the `classes` parameter is set, `y` will not be            iterated.        Returns        -------        y_indicator : array or CSR matrix, shape (n_samples, n_classes)            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in            `y[i]`, and 0 otherwise.        
"""        if self.classes is not None:            return self.fit(y).transform(y)        # Automatically increment on new class        class_mapping = defaultdict(int)        class_mapping.default_factory = class_mapping.__len__        yt = self._transform(y, class_mapping)        # sort classes and reorder columns        tmp = sorted(class_mapping, key=class_mapping.get)        # (make safe for tuples)        dtype = np.int if all(isinstance(c, int) for c in tmp) else object        class_mapping = np.empty(len(tmp), dtype=dtype)        class_mapping[:] = tmp        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)        yt.indices = np.take(inverse, yt.indices)        if not self.sparse_output:            yt = yt.toarray()        return yt    def transform(self, y):        """Transform the given label sets        Parameters        ----------        y : iterable of iterables            A set of labels (any orderable and hashable object) for each            sample. If the `classes` parameter is set, `y` will not be            iterated.        Returns        -------        y_indicator : array or CSR matrix, shape (n_samples, n_classes)            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in            `y[i]`, and 0 otherwise.        """        class_to_index = dict(zip(self.classes_, range(len(self.classes_))))        yt = self._transform(y, class_to_index)        if not self.sparse_output:            yt = yt.toarray()        return yt    def _transform(self, y, class_mapping):        """Transforms the label sets with a given mapping        Parameters        ----------        y : iterable of iterables        class_mapping : Mapping            Maps from label to column index in label indicator matrix        Returns        -------        y_indicator : sparse CSR matrix, shape (n_samples, n_classes)            Label indicator matrix        """        indices = array.array('i')        indptr = array.array('i', [0])        for labels in y:            indices.extend(set(class_mapping[label] for label in labels))            indptr.append(len(indices))        data = np.ones(len(indices), dtype=int)        return sp.csr_matrix((data, indices, indptr),                             shape=(len(indptr) - 1, len(class_mapping)))    def inverse_transform(self, yt):        """Transform the given indicator matrix into label sets        Parameters        ----------        yt : array or sparse matrix of shape (n_samples, n_classes)            A matrix containing only 1s ands 0s.        Returns        -------        y : list of tuples            The set of labels for each sample such that `y[i]` consists of            `classes_[j]` for each `yt[i, j] == 1`.        """        if yt.shape[1] != len(self.classes_):            raise ValueError('Expected indicator for {0} classes, but got {1}'                             .format(len(self.classes_), yt.shape[1]))        if sp.issparse(yt):            yt = yt.tocsr()            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:                raise ValueError('Expected only 0s and 1s in label indicator.')            return [tuple(self.classes_.take(yt.indices[start:end]))                    for start, end in zip(yt.indptr[:-1], yt.indptr[1:])]        else:            unexpected = np.setdiff1d(yt, [0, 1])            if len(unexpected) > 0:                raise ValueError('Expected only 0s and 1s in label indicator. 
'                                 'Also got {0}'.format(unexpected))            return [tuple(self.classes_.compress(indicators)) for indicators                    in yt]
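To tie the label.py classes together, here is a minimal round-trip sketch using the public API defined above; the toy labels are illustrative only.

from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MultiLabelBinarizer

# LabelEncoder: map arbitrary hashable labels to integers 0..n_classes-1 and back.
le = LabelEncoder()
codes = le.fit_transform(["paris", "tokyo", "paris", "amsterdam"])
print(codes)                        # [1 2 1 0]
print(le.inverse_transform(codes))  # original string labels

# LabelBinarizer: one-vs-all indicator matrix; inverse_transform picks the
# column with the greatest value (or thresholds, for binary/multilabel input).
lb = LabelBinarizer()
Y = lb.fit_transform([1, 2, 6, 4, 2])
print(lb.classes_)                  # array([1, 2, 4, 6])
print(lb.inverse_transform(Y))      # array([1, 2, 6, 4, 2])

# MultiLabelBinarizer: iterable of iterables <-> (n_samples, n_classes) indicator.
mlb = MultiLabelBinarizer()
Yt = mlb.fit_transform([(1, 2), (3,)])
print(mlb.classes_)                 # array([1, 2, 3])
print(mlb.inverse_transform(Yt))    # [(1, 2), (3,)]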