import numpy as np

df = pd.DataFrame([

size_mapping = {

# -----------------------------------------------

df

x

[{'color': 'green', 'prize': 10.1, 'size': 1}, {'color': 'red', 'prize': 13.5, 'size': 2}, {'color': 'blue', 'prize': 15.300000000000001, 'size': 3}]

label_list

['class1', 'class2', 'class1']

from sklearn.feature_extraction import DictVectorizer

array([[  0. ,   1. ,   0. ,  10.1,   1. ],       [  0. ,   0. ,   1. ,  13.5,   2. ],       [  1. ,   0. ,   0. ,  15.3,   3. ]])

from sklearn import preprocessing

array([[0],       [1],       [0]])

# 测试当 label 的类别数大于 2 时的效果

C:\Users\rHotD\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy  from ipykernel import kernelapp as app

feature_list = []

array([[1, 0, 0],       [0, 1, 0],       [0, 0, 1]])

# 结论：两者效果基本相同，但 pd.get_dummies 更好用一些

数据预处理（2）—— One-hot encoding 独热编码 # 分别使用 pandas.get_dummies 和 sklearn.feature_extraction.DictVectorizer 进行处理

离散 feature 的 encoding 分为两种情况：