对于取值大小无意义的类别特征(如颜色),可使用内置函数 hash() 进行编码。注意:Python 3 中字符串的 hash() 默认按进程随机加盐,不同进程得到的结果不同,需设置 PYTHONHASHSEED 才能复现。
import pandas as pd
def onehot_hash(df, column, n_buckets=2 ** 24, hash_func=hash):
    '''Encode a nominal (unordered) categorical column in place by hashing.

    Every value is converted with ``str()``, passed to ``hash_func`` and
    reduced modulo ``n_buckets``, so equal values always receive the same
    integer code in ``[0, n_buckets)``. Distinct values may collide in one
    bucket. Despite the function name, the result is a single integer
    column, not one indicator column per category.

    :param df: frame whose ``column`` is overwritten with the integer codes
    :type df: pandas.core.frame.DataFrame
    :param column: name of the column to encode
    :type column: str
    :param n_buckets: number of hash buckets; defaults to ``2 ** 24``
    :type n_buckets: int
    :param hash_func: callable taking a ``str`` and returning an ``int``.
        Defaults to the builtin ``hash``, which Python 3 salts per process
        for strings unless ``PYTHONHASHSEED`` is fixed; pass a stable
        function when the codes must be reproducible across runs.
    :raises ValueError: if ``n_buckets`` is not positive
    :return: None (``df`` is modified in place)
    '''
    if n_buckets <= 0:
        raise ValueError('n_buckets must be a positive integer')

    def _bucket(value):
        # With a positive modulus Python's % is never negative, so no
        # abs() is required to keep the code inside [0, n_buckets).
        return hash_func(str(value)) % n_buckets

    df[column] = df[column].apply(_bucket)
if __name__ == '__main__':
    # Demo: hash-encode a colour column in place and print the frame.
    df = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})
    onehot_hash(df, 'color')
    print(df)
    # Sample output. The exact codes depend on the interpreter's string
    # hash, but equal inputs (rows 0 and 3) share a code within one run:
    #       color
    # 0    704795
    # 1   9649319
    # 2  16714041
    # 3    704795
对于取值大小有意义的类别特征(如尺码),可使用 pandas.core.series.Series.map(arg, na_action=None) 按映射字典进行编码。
import pandas as pd
def onehot_map(df, column, mapping):
    '''Encode an ordered categorical column in place through a lookup table.

    Each value of ``df[column]`` is replaced by ``mapping[value]`` using
    ``Series.map``, so the numeric order of the codes can carry meaning
    (for example clothing sizes). Per the pandas ``Series.map``
    documentation, values missing from a dict mapping become NaN.

    :param df: frame whose ``column`` is overwritten with the mapped codes
    :type df: pandas.core.frame.DataFrame
    :param column: name of the column to encode
    :type column: str
    :param mapping: original value -> numeric code
    :type mapping: dict
    :return: None (``df`` is modified in place)
    '''
    source_values = df[column]
    encoded = source_values.map(mapping)
    df[column] = encoded
if __name__ == '__main__':
    # Demo: replace clothing sizes with ordered integer codes in place.
    size_codes = {'M': 1, 'L': 2, 'XL': 3}
    df = pd.DataFrame({'size': ['M', 'L', 'XL']})
    onehot_map(df, 'size', size_codes)
    print(df)
    # Expected output:
    #    size
    # 0     1
    # 1     2
    # 2     3
使用 pandas.get_dummies 将分类变量转换为哑变量(即独热编码)。
import pandas as pd
# Demo: expand a categorical column into one indicator column per category.
df = pd.DataFrame({'color': ['red', 'blue', 'green']})
# dtype=int keeps the indicators as 0/1 on every pandas version; since
# pandas 2.0 the default dtype is bool, which would print True/False.
dummies = pd.get_dummies(df, dtype=int)
print(dummies)
# Expected output:
#    color_blue  color_green  color_red
# 0           0            0          1
# 1           1            0          0
# 2           0            1          0