import pandas as pd
c1=pd.read_excel('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c1.xlsx')
c2=pd.read_csv('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c2.csv')
D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (0,11) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
c1.head()
kh | csrq | xb | djsj | |
---|---|---|---|---|
0 | c68b20b4 | 2002-11-02 00:00:00 | 0.0 | 2013-05-11 00:00:00.000 |
1 | 1ca15332 | NaN | 0.0 | 2004-11-04 16:31:52.436 |
2 | a37cc182 | 1967-02-17 00:00:00 | 0.0 | 2004-12-31 21:24:34.216 |
3 | 2ab88539 | 1982-06-01 00:00:00 | 0.0 | 2010-11-19 00:00:00.000 |
4 | b4c77269 | 1964-02-05 00:00:00 | 0.0 | 2007-12-14 00:00:00.000 |
c2.head()
kh | dtime | spbm | sl | sj | je | spmc | jf | syjh | djh | gzbm | gzmc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1be1e3fe | 2015-01-01 00:05:41.593 | f09c9303 | 1 | 290.0 | 270.20 | 兰芝化妆品正价瓶 | 270.20 | 6 | 25bb | 8077.0 | 兰芝柜 |
1 | 1be1e3fe | 2015-01-01 00:05:41.593 | f09c9303 | 1 | 325.0 | 302.80 | 兰芝化妆品正价瓶 | 302.80 | 6 | 25bb | 8077.0 | 兰芝柜 |
2 | 1be1e3fe | 2015-01-01 00:05:41.593 | f09c9303 | 1 | 195.0 | 181.80 | 兰芝化妆品正价瓶 | 181.80 | 6 | 25bb | 8077.0 | 兰芝柜 |
3 | 1be1e3fe | 2015-01-01 00:05:41.593 | f09c9303 | 1 | 270.0 | 251.55 | 兰芝化妆品正价瓶 | 251.55 | 6 | 25bb | 8077.0 | 兰芝柜 |
4 | 1be1e3fe | 2015-01-01 00:05:41.593 | f09c9303 | 2 | 245.0 | 456.55 | 兰芝化妆品正价瓶 | 456.55 | 6 | 25bb | 8077.0 | 兰芝柜 |
#去重
c1=c1.drop_duplicates()
c2=c2.drop_duplicates()
#合并数据集
c = pd.merge(c1,c2)
c.head()
kh | csrq | xb | djsj | dtime | spbm | sl | sj | je | spmc | jf | syjh | djh | gzbm | gzmc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 1967-02-17 00:00:00 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 499.0 | 297.07 | WMF D无 | 297.08 | 101 | 7cd8 | 7296.0 | WMF 柜 |
1 | a37cc182 | 1967-02-17 00:00:00 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 12288.0 | 11089.93 | WMF D无 | 11089.92 | 101 | 7cd8 | 7296.0 | WMF 柜 |
2 | a37cc182 | 1967-02-17 00:00:00 | 0.0 | 2004-12-31 21:24:34.216 | 2017-02-28 17:17:35.533 | 252403ef | 1 | 598.0 | 598.00 | 双立人商品 F无 | 598.00 | 102 | ed0a | 7242.0 | 双立人柜 |
3 | 34c2dc93 | 1986-07-18 00:00:00 | 0.0 | 2007-04-19 00:00:00.000 | 2017-11-13 13:36:45.656 | 4708e0bb | 1 | 260.0 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | 260.00 | 97 | c3f4 | 8156.0 | Shu Uemura |
4 | 828aa9b4 | 1951-12-29 00:00:00 | 0.0 | 2004-09-24 16:39:08.716 | 2017-05-17 11:30:39.080 | a4809307 | 1 | 1288.0 | 1288.00 | 酷彩F件 | 1288.00 | 102 | 95b0 | 7300.0 | Le Creuset(酷彩) |
# 查看数据的形状
print("数据的形状:", c.shape)
# 查看每列数据的非空值的个数和数据类型
print("每列数据的非空值的个数和数据类型:\n", c.info())
# 查看每列数据的描述统计信息
print("数据的描述统计信息:\n", c.describe())
# 查看每列数据的空值个数
print("每列数据的空值个数:\n", c.isnull().sum())
数据的形状: (506006, 15)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh 506006 non-null object
csrq 482173 non-null object
xb 495568 non-null float64
djsj 474547 non-null datetime64[ns]
dtime 506006 non-null object
spbm 506006 non-null object
sl 506006 non-null int64
sj 506006 non-null float64
je 506006 non-null float64
spmc 506006 non-null object
jf 506006 non-null float64
syjh 506006 non-null int64
djh 506006 non-null object
gzbm 506006 non-null float64
gzmc 492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
每列数据的非空值的个数和数据类型:
None
数据的描述统计信息:
xb sl sj je \
count 495568.000000 506006.000000 5.060060e+05 5.060060e+05
mean 0.098372 1.054300 1.430813e+03 1.356862e+03
std 0.297817 2.366001 3.695847e+03 3.706632e+03
min 0.000000 -30.000000 1.000000e-01 -6.712575e+05
25% 0.000000 1.000000 3.400000e+02 3.200000e+02
50% 0.000000 1.000000 7.200000e+02 6.960000e+02
75% 0.000000 1.000000 1.622000e+03 1.577430e+03
max 1.000000 1492.000000 1.342515e+06 1.342515e+06
jf syjh gzbm
count 506006.000000 506006.000000 506006.000000
mean 1030.375453 111.340160 6296.818781
std 2125.932871 82.427647 2363.259002
min -201616.000000 1.000000 0.000000
25% 145.000000 43.000000 4334.000000
50% 500.000000 97.000000 8064.000000
75% 1270.000000 170.000000 8121.000000
max 268503.000000 320.000000 9005.000000
每列数据的空值个数:
kh 0
csrq 23833
xb 10438
djsj 31459
dtime 0
spbm 0
sl 0
sj 0
je 0
spmc 0
jf 0
syjh 0
djh 0
gzbm 0
gzmc 13543
dtype: int64
c.describe()
xb | sl | sj | je | jf | syjh | gzbm | |
---|---|---|---|---|---|---|---|
count | 495568.000000 | 506006.000000 | 5.060060e+05 | 5.060060e+05 | 506006.000000 | 506006.000000 | 506006.000000 |
mean | 0.098372 | 1.054300 | 1.430813e+03 | 1.356862e+03 | 1030.375453 | 111.340160 | 6296.818781 |
std | 0.297817 | 2.366001 | 3.695847e+03 | 3.706632e+03 | 2125.932871 | 82.427647 | 2363.259002 |
min | 0.000000 | -30.000000 | 1.000000e-01 | -6.712575e+05 | -201616.000000 | 1.000000 | 0.000000 |
25% | 0.000000 | 1.000000 | 3.400000e+02 | 3.200000e+02 | 145.000000 | 43.000000 | 4334.000000 |
50% | 0.000000 | 1.000000 | 7.200000e+02 | 6.960000e+02 | 500.000000 | 97.000000 | 8064.000000 |
75% | 0.000000 | 1.000000 | 1.622000e+03 | 1.577430e+03 | 1270.000000 | 170.000000 | 8121.000000 |
max | 1.000000 | 1492.000000 | 1.342515e+06 | 1.342515e+06 | 268503.000000 | 320.000000 | 9005.000000 |
c.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh 506006 non-null object
csrq 482173 non-null object
xb 495568 non-null float64
djsj 474547 non-null datetime64[ns]
dtime 506006 non-null object
spbm 506006 non-null object
sl 506006 non-null int64
sj 506006 non-null float64
je 506006 non-null float64
spmc 506006 non-null object
jf 506006 non-null float64
syjh 506006 non-null int64
djh 506006 non-null object
gzbm 506006 non-null float64
gzmc 492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
# 查看销售日期列数据中每个值的个数
c.dtime.value_counts()
2017-11-25 15:36:23.626 54
2015-04-25 11:16:56.140 45
2015-03-02 12:12:11.923 38
2015-04-26 13:19:38.080 37
2016-06-24 14:27:57.830 37
2016-09-23 14:50:44.063 36
2015-04-24 18:47:34.390 35
2015-06-03 16:07:30.813 34
2016-12-31 16:45:42.343 33
2015-04-26 17:40:18.063 33
2015-04-24 17:03:14.423 32
2017-11-26 16:02:26.936 32
2017-08-27 17:29:24.376 31
2016-11-25 12:19:11.720 31
2016-08-27 12:13:19.203 31
2015-04-26 11:20:21.813 30
2015-01-10 17:13:34.343 30
2015-04-24 15:17:28.390 30
2016-09-23 18:09:19.796 30
2015-08-17 13:44:25.686 30
2017-05-12 16:14:14.360 29
2015-01-01 11:52:18.890 29
2017-09-23 15:48:34.186 29
2017-10-08 16:41:18.283 28
2015-04-24 17:17:20.110 27
2017-08-26 15:06:12.046 27
2015-04-25 15:24:03.436 27
2017-11-26 15:05:00.516 27
2015-02-16 17:28:15.906 26
2017-05-12 18:59:47.906 26
..
2016-11-04 14:03:26.453 1
2016-03-30 16:48:55.110 1
2017-10-19 17:27:16.126 1
2016-05-24 15:16:28.936 1
2017-05-23 14:34:51.546 1
2015-04-24 16:33:23.423 1
2017-01-19 17:12:30.610 1
2015-06-24 12:10:02.813 1
2016-07-26 19:34:21.330 1
2015-04-02 20:47:07.453 1
2016-01-09 16:42:14.156 1
2017-05-08 13:52:32.266 1
2017-06-18 16:39:42.436 1
2015-01-16 16:54:14.063 1
2017-05-05 13:18:11.686 1
2017-02-03 18:28:31.203 1
2017-05-19 12:08:02.500 1
2017-09-17 15:24:35.500 1
2017-08-17 12:41:08.626 1
2015-07-13 11:04:57.796 1
2015-04-30 17:17:53.110 1
2017-07-05 18:30:27.813 1
2016-06-28 12:15:22.093 1
2015-07-27 12:07:34.093 1
2015-05-02 19:00:01.626 1
2017-10-13 13:44:54.173 1
2017-09-22 17:02:49.360 1
2015-02-01 17:17:37.580 1
2017-04-08 12:51:00.110 1
2016-06-28 11:58:35.296 1
Name: dtime, Length: 264022, dtype: int64
## 查看销售日期列数据中每年的个数
import datetime as dt
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
c.dtime.dt.year.value_counts()
2017 231373
2016 146958
2015 123852
2018 3823
Name: dtime, dtype: int64
# 数据去重
new_c = c.drop_duplicates()
print(new_c.shape)
(506006, 15)
# 去除数据中的空值
c = new_c.dropna()
c.shape
(445894, 15)
c.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 445894 entries, 0 to 506005
Data columns (total 15 columns):
kh 445894 non-null object
csrq 445894 non-null object
xb 445894 non-null float64
djsj 445894 non-null datetime64[ns]
dtime 445894 non-null datetime64[ns]
spbm 445894 non-null object
sl 445894 non-null int64
sj 445894 non-null float64
je 445894 non-null float64
spmc 445894 non-null object
jf 445894 non-null float64
syjh 445894 non-null int64
djh 445894 non-null object
gzbm 445894 non-null float64
gzmc 445894 non-null object
dtypes: datetime64[ns](2), float64(5), int64(2), object(6)
memory usage: 54.4+ MB
#定义年龄0-44为青年,45-59为中年人,60以上为老年人
#在c表中csrq
import datetime as dt
#提取年数
c['csrq']=pd.to_datetime(c['csrq'],errors='coerce')
c['year'] = c['csrq'].dt.year
#年龄计算
now_year = dt.datetime.now().year
c['age'] = now_year - c['year']
#年龄阶层
bins = [0,44,59,120]
labels = ['青年 ','中年人', '老年人']
c['年龄阶层'] = pd.cut(c['age'], bins, labels=labels)
#季节
c['month']=c['dtime'].dt.month
bins = [0,2.9,5,8,11,12.1]
labels = ['冬季 ','春季', '夏季', '秋季','冬季']
#labels = ['Winter ','Spring', 'Summer', 'Autumn','Winter']
c['季节'] = pd.cut(c['month'], bins, labels=labels)
#入会时长数据定义、处理与描述
c['djsj']=pd.to_datetime(c['djsj'],errors='coerce')
c['入会时长']=dt.datetime.now().year-c['djsj'].dt.year
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
if __name__ == '__main__':
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
del sys.path[0]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from ipykernel import kernelapp as app
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:21: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
c.head()
kh | csrq | xb | djsj | dtime | spbm | sl | sj | je | spmc | ... | syjh | djh | gzbm | gzmc | year | age | 年龄阶层 | month | 季节 | 入会时长 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 499.0 | 297.07 | WMF D无 | ... | 101 | 7cd8 | 7296.0 | WMF 柜 | 1967 | 53 | 中年人 | 9 | 秋季 | 16 |
1 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 12288.0 | 11089.93 | WMF D无 | ... | 101 | 7cd8 | 7296.0 | WMF 柜 | 1967 | 53 | 中年人 | 9 | 秋季 | 16 |
2 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2017-02-28 17:17:35.533 | 252403ef | 1 | 598.0 | 598.00 | 双立人商品 F无 | ... | 102 | ed0a | 7242.0 | 双立人柜 | 1967 | 53 | 中年人 | 2 | 冬季 | 16 |
3 | 34c2dc93 | 1986-07-18 | 0.0 | 2007-04-19 00:00:00.000 | 2017-11-13 13:36:45.656 | 4708e0bb | 1 | 260.0 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | ... | 97 | c3f4 | 8156.0 | Shu Uemura | 1986 | 34 | 青年 | 11 | 秋季 | 13 |
4 | 828aa9b4 | 1951-12-29 | 0.0 | 2004-09-24 16:39:08.716 | 2017-05-17 11:30:39.080 | a4809307 | 1 | 1288.0 | 1288.00 | 酷彩F件 | ... | 102 | 95b0 | 7300.0 | Le Creuset(酷彩) | 1951 | 69 | 老年人 | 5 | 春季 | 16 |
5 rows × 21 columns
c.age.describe()
count 445894.000000
mean 63.031622
std 62.198657
min 3.000000
25% 38.000000
50% 46.000000
75% 53.000000
max 267.000000
Name: age, dtype: float64
#去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
c['age'] = c['age'].drop(c['age'][c['age'] > 120].index)
#去除不符合逻辑的值:删除消费金额小于0的行
c['je']=c['je'].drop(c['je'][c['je'] < 0].index)
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
# c.rename(
# columns={'kh':'卡号', 'dtime':'消费时间', 'spbm':'商品编码', 'sl':'数量',
# 'sj':'售价', 'je':'实际消费金额', 'spmc':'商品名称', 'jf':'积分',
# 'syjh':'收银机号', 'djh':'单据号', 'gzbm':'柜组编码',
# 'gzmc':'柜组名称'}, inplace = True)
import numpy as np
c.groupby(by=["spmc"])["je"].agg({"各类商品销售金额":np.sum})
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
各类商品销售金额 | |
---|---|
spmc | |
.雷达钟表无 | 1370300.52 |
23区女装系列A件 | 1655993.77 |
23区女装系列B.5件 | 253656.22 |
23区女装系列B件 | 280106.95 |
23区女装系列C件 | 258243.90 |
23区女装系列D件 | 63571.14 |
23区女装系列E件 | 105316.01 |
23区女装系列F件 | 1426.00 |
23区女装系列正价件 | 14810.40 |
3S 系列B无 | 1599.00 |
3S 系列C无 | 1399.00 |
5+E | 285.00 |
5+E. | 335.00 |
5+服饰A件 | 1521.00 |
5+服饰C件 | 978.00 |
5+服饰E件 | 445.00 |
6F-NIKEKIDS正价件 | 199.00 |
6F欣欣玩具正价个 | 713.00 |
8848 正价个 | 270080.00 |
A.O.史密斯 正价件 | 146028.04 |
ABLE JEANS正价件 | 1399.00 |
AD 正价无 | 799.00 |
ADSE 正价无 | 699.00 |
ADSE 系列A无 | 508.00 |
AD集合店正价 | 1996.00 |
AO2F件 | 5615.00 |
AO2正价件 | 1434.00 |
AOJO 正价副 | 549.00 |
APM D无 | 48688.00 |
APM 正价无 | 2364639.24 |
... | ... |
香奈儿限量35ml邂逅35ml | 6077.00 |
香奈儿青春光彩保湿粉凝霜. | 16182.50 |
香奈儿青春光彩柔润粉芯. | 22616.94 |
香奈儿青春光彩柔润粉饼13g | 281251.06 |
香奈儿青春光彩水润粉底液30ml | 2899.16 |
香奈儿魅力润体乳200ml | 1010.00 |
香水 正价无 | 95674.49 |
马天奴 A件 | 2788.00 |
骆驼新B* | 670.00 |
高美高手袋系列A个 | 14881.11 |
高美高手袋系列B个 | 51714.90 |
高美高手袋系列F个 | 3397.00 |
高美高手袋系列正价个 | 2088.00 |
高美高正价* | 1888.00 |
高美高特卖个 | 5568.00 |
高美高系列正价个 | 479329.78 |
魅可唇膏支 | 510.00 |
魅可持久防水眼线笔支 | 200.00 |
魅可新式卷笔刀支 | 40.00 |
魅可时尚焦点小眼影(片)支 | 115.00 |
魅可焦点小眼影支 | 155.00 |
黄金金条件 | 37978.20 |
黄金饰品克 | 105375.00 |
黛安芬内衣系列3件 | 36.00 |
黛安芬内衣系列B件 | 47239.81 |
黛安芬内衣系列C件 | 24655.99 |
黛安芬内衣系列E件 | 2124.00 |
黛安芬内衣系列F件 | 2647.92 |
黛安芬内衣系列件 | 475907.51 |
黛安芬内衣系列员购件 | 8669.00 |
5936 rows × 1 columns
je = c.groupby(by=["spmc"])["je"].agg({"sum":np.sum})[0:5]
#index = range(len(je))
index = range(5)
plt.bar(index, je["sum"], color="dodgerblue")
plt.xticks(index, je.index)
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
"""Entry point for launching an IPython kernel.
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-59b1cBLf-1584007168716)(output_18_1.png)]
import matplotlib.pyplot as plt
qn=c.age[(c.age < 45)].shape[0]
zn=c.age[(c.age < 60)&(c.age >= 45)].shape[0]
ln=c.age[(c.age >= 60)].shape[0]
import matplotlib.pyplot as plt
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member age')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XOo9KI81-1584007168721)(output_19_0.png)]
#性别人数比例和不同性别消费金额比例
#性别,0女,1男
#性别比例图
#年龄占比图
import matplotlib.pyplot as plt
Female=c['xb'][(c['xb'] == 0)].shape[0]
Male=c['xb'][(c['xb'] == 1)].shape[0]
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightgreen','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member sex')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XPB7UnkN-1584007168723)(output_20_0.png)]
#不同年龄阶层消费总额占比
qne = c[(c.age < 45)]['je'].sum()
zne = c[(c.age < 60)&(c.age >= 45)]['je'].sum()
lne = c[(c.age >= 60)]['je'].sum()
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different age member''s payment')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-muUEQQEd-1584007168727)(output_21_0.png)]
#不同性别消费金额占比
import matplotlib.pyplot as plt
Female=c[(c['xb'] == 0)]['je'].sum()
Male=c[(c['xb'] == 1)]['je'].sum()
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightskyblue','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different members sex payment')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-AyHHa9WB-1584007168729)(output_22_0.png)]
#会员订单数=会员字段非空且单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
hy=c2[(c2['kh'].isnull()==False)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
#非会员订单数=会员字段为空但单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
fhy=c2[(c2['kh'].isnull()==True)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
# #提取年数
# hy['csrq']=pd.to_datetime(hy['csrq'],errors='coerce')
# hy['year'] = hy['csrq'].dt.year
# fhy['csrq']=pd.to_datetime(fhy['csrq'],errors='coerce')
# fhy['year'] = fhy['csrq'].dt.year
# #年龄计算
# now_year = dt.datetime.now().year
# hy['age'] = now_year - hy['year']
# fhy['age'] = now_year - fhy['year']
# #去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
# hy['age'] = hy['age'].drop(hy['age'][hy['age'] > 120].index)
# fhy['age'] = fhy['age'].drop(fhy['age'][fhy['age'] > 120].index)
#c2['kh'](c2['kh'].isnull()==True)
num_hy=hy.iloc[:,0].size
num_fhy=fhy.iloc[:,0].size
print (num_hy,num_fhy)
869072 997811
#会员与非会员的订单数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(num_hy,num_fhy)
colors=['lightgreen','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Srh33bjN-1584007168731)(output_24_0.png)]
#会员与非会员的消费金额总数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(hy['je'].sum(),fhy['je'].sum())
colors=['lightblue','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jERIprf7-1584007168740)(output_25_0.png)]
#2015-2018各年会员消费总额
#对hy表格增加年、月两列,删除除je、year、month的列
hy['dtime']=pd.to_datetime(hy['dtime'],errors='coerce')
hyy=hy
hyy['year']=hyy['dtime'].dt.year
hyy['month']=hyy['dtime'].dt.month
hyy=hyy.drop(['kh', 'dtime','spbm','sl','sj','spmc','jf','syjh','djh','gzbm','gzmc'], axis=1)
#会员2015-2018各年会员消费总额
import numpy as np
je_counts=hyy.groupby(by=["year"])["je"].agg({"各年销售金额":np.sum})
#饼图
plt.pie(je_counts, labels=je_counts.index,explode=[0.1,0,0.1,0.1],
colors=['lightblue','gold','lightgreen','lightcoral'],
autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of yearly payment')
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
# Remove the CWD from sys.path while we load stuff.
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:14: MatplotlibDeprecationWarning: Non-1D inputs to pie() are currently squeeze()d, but this behavior is deprecated since 3.1 and will be removed in 3.3; pass a 1D array instead.
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tUu3vx9P-1584007168751)(output_26_1.png)]
#2015-2018年每个月的消费金额
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
for i in range(2015, 2019):
monthly = []
for j in range(0,13):
monthly.append(c[(c['dtime'].dt.year == i) & (c['dtime'].dt.month == j)]['je'].sum())
plt.plot(range(0,len(monthly)), monthly,color=(0.4, 0.8, 0.6))
plt.xlabel("Month")
plt.ylabel("Amount")
plt.grid(True)
plt.title('The consumption in '+str(i))
plt.show()
# plt.plot(sl_counts.index, sl_counts, "-", color=(0.4, 0.8, 0.6))
# plt.xlabel("日期")
# plt.xticks(rotation=15)
# plt.ylabel("销量")
# plt.grid(True)
# plt.title("折线图")
# plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IGATV2CI-1584007168765)(output_27_1.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yrT9weJs-1584007168776)(output_27_2.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8O0NO7gq-1584007168780)(output_27_3.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uJ5IRmQ4-1584007168786)(output_27_4.png)]
#各时段消费人数
zs=c['dtime'][(c['dtime'].dt.hour >= 5)&(c['dtime'].dt.hour < 11)].shape[0]
zw=c['dtime'][(c['dtime'].dt.hour >= 11)&(c['dtime'].dt.hour < 14)].shape[0]
xw=c['dtime'][(c['dtime'].dt.hour >= 14)&(c['dtime'].dt.hour < 19)].shape[0]
ws=c['dtime'][(c['dtime'].dt.hour >= 19)&(c['dtime'].dt.hour < 24)].shape[0]
lc=c['dtime'][(c['dtime'].dt.hour >= 0)&(c['dtime'].dt.hour < 5)].shape[0]
print ('早上:05:00:00~10:59:59订单数量:%d'%zs)
print ('中午:11:00:00~13:59:59订单数量:%d'%zw)
print ('下午:14:00:00~18:59:59订单数量:%d'%xw)
print ('晚上:19:00:00~23:59:59订单数量:%d'%ws)
print ('凌晨:00:00:00~04:59:59订单数量:%d'%lc)
早上:05:00:00~10:59:59订单数量:17833
中午:11:00:00~13:59:59订单数量:104883
下午:14:00:00~18:59:59订单数量:223950
晚上:19:00:00~23:59:59订单数量:99136
凌晨:00:00:00~04:59:59订单数量:92
#各时段人数可视化图
import matplotlib.pyplot as plt
labels=('Morning','Noon','Early morning','Afternoon','Evening')
sizes=(zs,zw,lc,xw,ws)
explode=[0,0.1,0,0.1,0.1]
plt.pie(sizes,explode=explode,labels=labels,colors=['lightblue','gold','lightgreen','lightcoral','lightgreen'],
autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member time')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-N8C60Yuv-1584007168789)(output_29_0.png)]
#春季3、4、5月;夏季6、7、8月;秋季9、10、11月;冬季12、1、2月
Spring=c['dtime'][(c['dtime'].dt.month >= 3)&(c['dtime'].dt.month <= 5)].shape[0]
Summer=c['dtime'][(c['dtime'].dt.month >= 6)&(c['dtime'].dt.month <= 8)].shape[0]
Autumn=c['dtime'][(c['dtime'].dt.month >= 9)&(c['dtime'].dt.month <= 11)].shape[0]
Winter=c['dtime'][(c['dtime'].dt.month >= 12)&(c['dtime'].dt.month <= 2)].shape[0]
print ('春季3、4、5月订单数量:%d'%Spring)
print ('夏季6、7、8月订单数量:%d'%Summer)
print ('秋季9、10、11月订单数量:%d'%Autumn)
print ('冬季12、1、2月订单数量:%d'%Winter)
春季3、4、5月订单数量:132681
夏季6、7、8月订单数量:118389
秋季9、10、11月订单数量:100197
冬季12、1、2月订单数量:0
#各季节会员消费人数
import matplotlib.pyplot as plt
labels=('Spring','Summer','Autumn','Winter')
sizes=(Spring,Summer,Autumn,Winter)
explode=[0.1,0,0.1,0]
colors=['lightgreen','gold','lightskyblue','lightcoral']
plt.pie(sizes,explode=explode,labels=labels,colors=colors
,autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of season')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EwYd2g5S-1584007168790)(output_31_0.png)]
c.head()
kh | csrq | xb | djsj | dtime | spbm | sl | sj | je | spmc | ... | syjh | djh | gzbm | gzmc | year | age | 年龄阶层 | month | 季节 | 入会时长 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 499.0 | 297.07 | WMF D无 | ... | 101 | 7cd8 | 7296.0 | WMF 柜 | 1967 | 53.0 | 中年人 | 9 | 秋季 | 16 |
1 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 12288.0 | 11089.93 | WMF D无 | ... | 101 | 7cd8 | 7296.0 | WMF 柜 | 1967 | 53.0 | 中年人 | 9 | 秋季 | 16 |
2 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2017-02-28 17:17:35.533 | 252403ef | 1 | 598.0 | 598.00 | 双立人商品 F无 | ... | 102 | ed0a | 7242.0 | 双立人柜 | 1967 | 53.0 | 中年人 | 2 | 冬季 | 16 |
3 | 34c2dc93 | 1986-07-18 | 0.0 | 2007-04-19 00:00:00.000 | 2017-11-13 13:36:45.656 | 4708e0bb | 1 | 260.0 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | ... | 97 | c3f4 | 8156.0 | Shu Uemura | 1986 | 34.0 | 青年 | 11 | 秋季 | 13 |
4 | 828aa9b4 | 1951-12-29 | 0.0 | 2004-09-24 16:39:08.716 | 2017-05-17 11:30:39.080 | a4809307 | 1 | 1288.0 | 1288.00 | 酷彩F件 | ... | 102 | 95b0 | 7300.0 | Le Creuset(酷彩) | 1951 | 69.0 | 老年人 | 5 | 春季 | 16 |
5 rows × 21 columns
##任务3.1:构建会员用户“基本”特征标签
#固有基础属性标签:性别、年龄阶层、入会时长
c['性别']=c['xb'].map({1.0:'男',0.0:'女'})
# c['年龄']=aaa.replace("黑色","黄色")
im=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj',
'syjh','djh','gzbm','gzmc','year','age','month'], axis=1)
im.head()
kh | je | spmc | jf | 年龄阶层 | 季节 | 入会时长 | 性别 | 消费水平 | 新老会员 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 297.07 | WMF D无 | 297.08 | 中年人 | 秋季 | 16 | 女 | 低消费 | 老会员 |
1 | a37cc182 | 11089.93 | WMF D无 | 11089.92 | 中年人 | 秋季 | 16 | 女 | 高消费 | 老会员 |
2 | a37cc182 | 598.00 | 双立人商品 F无 | 598.00 | 中年人 | 冬季 | 16 | 女 | 中等消费 | 老会员 |
3 | 34c2dc93 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | 260.00 | 青年 | 秋季 | 13 | 女 | 低消费 | 老会员 |
4 | 828aa9b4 | 1288.00 | 酷彩F件 | 1288.00 | 老年人 | 春季 | 16 | 女 | 中等消费 | 老会员 |
#任务3.2:构建会员用户“业务”特征标签
#基础性息处理后得到标签:消费水平(300以下低消费;300-1500中等消费;1500以上高消费)
#入会时长:新老会员;消费水平:低中高;有价值的用户信息
#消费金额数据描述
im['je'].describe()
im['入会时长'].describe()
count 445894.000000
mean 6.322101
std 2.989830
min 2.000000
25% 4.000000
50% 5.000000
75% 8.000000
max 18.000000
Name: 入会时长, dtype: float64
im['消费水平'] = pd.cut(im['je'], bins=[0,300,1500,9999999], labels=['低消费','中等消费', '高消费'] )
im['新老会员'] = pd.cut(im['入会时长'], bins=[0,5,999], labels=['新会员','老会员'] )
im.head()
kh | je | jf | 年龄阶层 | 季节 | 入会时长 | 性别 | 消费水平 | 新老会员 | |
---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 297.07 | 297.08 | 中年人 | 秋季 | 16 | 女 | 低消费 | 老会员 |
1 | a37cc182 | 11089.93 | 11089.92 | 中年人 | 秋季 | 16 | 女 | 高消费 | 老会员 |
2 | a37cc182 | 598.00 | 598.00 | 中年人 | 冬季 | 16 | 女 | 中等消费 | 老会员 |
3 | 34c2dc93 | 260.00 | 260.00 | 青年 | 秋季 | 13 | 女 | 低消费 | 老会员 |
4 | 828aa9b4 | 1288.00 | 1288.00 | 老年人 | 春季 | 16 | 女 | 中等消费 | 老会员 |
#任务3.3:构建会员用户“偏好”特征标签
#用户行为推测标签:经常购买运动T、裤、鞋为体育标签;购物行为——怀孕标签
#购物时间段、季节、商品类型偏好
im['购物时间段'] = pd.cut(c['dtime'].dt.hour, bins=[0,5,11,14,19,24], labels=['凌晨','早上', '中午','下午','晚上'] )
im.head()
kh | je | spmc | jf | 年龄阶层 | 季节 | 入会时长 | 性别 | 消费水平 | 新老会员 | 购物时间段 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 297.07 | WMF D无 | 297.08 | 中年人 | 秋季 | 16 | 女 | 低消费 | 老会员 | 早上 |
1 | a37cc182 | 11089.93 | WMF D无 | 11089.92 | 中年人 | 秋季 | 16 | 女 | 高消费 | 老会员 | 早上 |
2 | a37cc182 | 598.00 | 双立人商品 F无 | 598.00 | 中年人 | 冬季 | 16 | 女 | 中等消费 | 老会员 | 下午 |
3 | 34c2dc93 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | 260.00 | 青年 | 秋季 | 13 | 女 | 低消费 | 老会员 | 中午 |
4 | 828aa9b4 | 1288.00 | 酷彩F件 | 1288.00 | 老年人 | 春季 | 16 | 女 | 中等消费 | 老会员 | 早上 |
#删去无关列
im=im.drop(['je','jf','入会时长'], axis=1)
im.head()
kh | spmc | 年龄阶层 | 季节 | 性别 | 消费水平 | 新老会员 | 购物时间段 | |
---|---|---|---|---|---|---|---|---|
0 | a37cc182 | WMF D无 | 中年人 | 秋季 | 女 | 低消费 | 老会员 | 早上 |
1 | a37cc182 | WMF D无 | 中年人 | 秋季 | 女 | 高消费 | 老会员 | 早上 |
2 | a37cc182 | 双立人商品 F无 | 中年人 | 冬季 | 女 | 中等消费 | 老会员 | 下午 |
3 | 34c2dc93 | 植村秀三色眼影/眼线液/眉笔. | 青年 | 秋季 | 女 | 低消费 | 老会员 | 中午 |
4 | 828aa9b4 | 酷彩F件 | 老年人 | 春季 | 女 | 中等消费 | 老会员 | 早上 |
#任务3.4:建立用户画像
#整合所有标签信息
#1.文字呈现每个用户的画像结果(#会员卡号、性别、偏好品类、近期消费次数)
#2.可视化方式呈现
#对2bde7c95
bd=im[(im['kh']=='2bde7c95')]
bd=bd.drop(['kh'], axis=1)
bd.head()
bd.to_csv('D:/test.csv')
# import matplotlib.pyplot as plt
# mid_je = im
# mid_je = mid_je.reset_index()
# dic = {x[0]:x[1] for x in mid_je.loc[:,:].values}
# wc= wordcloud.WordCloud(scale=16, font_path='simhei.ttf',
# background_color='white', max_words=100,colormap="coolwarm")
# X = wc.generate_from_frequencies(dic)
# plt.axis('off')
# plt.imshow(X)
# wordcloud = WordCloud(background_color='white',scale=1.5).generate(bd)
# #显示词云图片
# plt.imshow(wordcloud)
# plt.axis('off')
# plt.show()
#导入wordcloud模块和matplotlib模块
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy.misc import imread
#读取一个txt文件
with open("D:/test.csv" ,encoding="utf-8")as file:
#读取文本内容
text=file.read()
#读入背景图片
# bg_pic = imread('D:/3.png')
#生成词云
wordcloud = WordCloud(background_color='white',font_path='simhei.ttf',scale=1.5,
max_words=100,colormap="coolwarm").generate(text)
# image_colors = ImageColorGenerator(bg_pic)
#显示词云图片
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#保存图片
# wordcloud.to_file('test.jpg')
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-P9HCikf3-1584007168794)(output_39_0.png)]
c.head()
kh | csrq | xb | djsj | dtime | spbm | sl | sj | je | spmc | ... | gzmc | year | age | 年龄阶层 | month | 季节 | 入会时长 | 性别 | 消费水平 | 新老会员 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 499.0 | 297.07 | WMF D无 | ... | WMF 柜 | 1967 | 53.0 | 中年人 | 9 | 秋季 | 16 | 女 | 低消费 | 老会员 |
1 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2016-09-25 10:49:14.016 | d62a69e3 | 1 | 12288.0 | 11089.93 | WMF D无 | ... | WMF 柜 | 1967 | 53.0 | 中年人 | 9 | 秋季 | 16 | 女 | 高消费 | 老会员 |
2 | a37cc182 | 1967-02-17 | 0.0 | 2004-12-31 21:24:34.216 | 2017-02-28 17:17:35.533 | 252403ef | 1 | 598.0 | 598.00 | 双立人商品 F无 | ... | 双立人柜 | 1967 | 53.0 | 中年人 | 2 | 冬季 | 16 | 女 | 中等消费 | 老会员 |
3 | 34c2dc93 | 1986-07-18 | 0.0 | 2007-04-19 00:00:00.000 | 2017-11-13 13:36:45.656 | 4708e0bb | 1 | 260.0 | 260.00 | 植村秀三色眼影/眼线液/眉笔. | ... | Shu Uemura | 1986 | 34.0 | 青年 | 11 | 秋季 | 13 | 女 | 低消费 | 老会员 |
4 | 828aa9b4 | 1951-12-29 | 0.0 | 2004-09-24 16:39:08.716 | 2017-05-17 11:30:39.080 | a4809307 | 1 | 1288.0 | 1288.00 | 酷彩F件 | ... | Le Creuset(酷彩) | 1951 | 69.0 | 老年人 | 5 | 春季 | 16 | 女 | 中等消费 | 老会员 |
5 rows × 24 columns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
km=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj','spmc',
'syjh','gzbm','gzmc','year','age','month','jf',
'年龄阶层','季节','性别','消费水平','新老会员'], axis=1)
km_cp=km.copy()
km.head()
kh | je | djh | 入会时长 | |
---|---|---|---|---|
0 | a37cc182 | 297.07 | 7cd8 | 16 |
1 | a37cc182 | 11089.93 | 7cd8 | 16 |
2 | a37cc182 | 598.00 | ed0a | 16 |
3 | 34c2dc93 | 260.00 | c3f4 | 13 |
4 | 828aa9b4 | 1288.00 | 95b0 | 16 |
#入会时长 (Duration)
#消费频率 (Frequency)
#消费金额 (Monetary)
#计算最近一次消费
D=km_cp[['kh','入会时长']]
#计算消费次数
Fre=km_cp[['kh','djh']].groupby('kh').count().reset_index()
#用户个人总消费金额
Mone=km_cp[['kh','je']].groupby('kh').sum().reset_index()
#合并数据集
km_1=pd.merge(D,Fre,on='kh',how='inner')
km_2=pd.merge(km_1,Mone,on='kh',how='inner')
km_2=km_2.rename(columns={'kh':'卡号','djh':'消费频率','je':'消费金额'})
km_2.describe()
入会时长 | 消费频率 | 消费金额 | |
---|---|---|---|
count | 445894.000000 | 445894.000000 | 4.458940e+05 |
mean | 6.322101 | 85.904888 | 1.172853e+05 |
std | 2.989830 | 262.171340 | 3.098385e+05 |
min | 2.000000 | 1.000000 | 0.000000e+00 |
25% | 4.000000 | 11.000000 | 1.071100e+04 |
50% | 5.000000 | 32.000000 | 3.541280e+04 |
75% | 8.000000 | 81.000000 | 1.081595e+05 |
max | 18.000000 | 3039.000000 | 3.257258e+06 |
#K-means模型训练
km_future=km_2[['入会时长','消费频率','消费金额']]
km_2_SC=StandardScaler().fit_transform(data_future)
KMeans_model=KMeans(n_clusters=3)
fit_model=KMeans_model.fit(data_b_SC)
print(KMeans_model.cluster_centers_)
[[ 1.30518985 -0.10829497 -0.13086107]
[-1.11113518 11.26400174 10.13423447]
[-0.54919514 -0.06399474 -0.04323129]]
#K-means聚类雷达图绘制
angles=np.linspace(0,2*np.pi,3,endpoint=False)
angles=np.concatenate((angles,[angles[0]]))
centers=KMeans_model.cluster_centers_
plt_data=np.concatenate((centers,centers[:,[0]]),axis=1)
label=['Duration','Frequency','Monetary']
fig=plt.figure(figsize=(6,6))
ax=fig.add_subplot(111,polar=True)
for i in range(len(plt_data)):
ax.plot(angles,plt_data[i],'o-',label=label[i])
ax.set_thetagrids(angles*180/np.pi,label)
plt.legend(bbox_to_anchor=(0.8,1.15),ncol=3)
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IFPDvVQF-1584007168796)(output_44_0.png)]
# from sklearn.preprocessing import StandardScaler # 导入数据标准化模块
# from sklearn.cluster import KMeans # 导入K-Means聚类模块
# from sklearn.datasets import load_iris # 导入鸢尾花数据
# iris = load_iris()
# iris_data = iris['data']
# iris_target = iris['target']
# iris_names = iris['feature_names']
# sc = StandardScaler()
# iris_data = sc.fit_transform(iris_data) # 数据标准化
# model = KMeans(n_clusters=3,random_state=0,max_iter=500) # 构建聚类模块
# fit_model = model.fit(iris_data) # 训练聚类模块
# #输出聚类中心
# print('聚类中心\n',model.cluster_centers_)
# print('类别\n',pd.Series(model.labels_).value_counts())
# angles = np.linspace(0,2*np.pi,4,endpoint=False) # 将圆根据标签的个数等比分
# angles = np.concatenate((angles,[angles[0]])) # 闭合
# centers = model.cluster_centers_ # 获取聚类中心数据
# plot_data = np.concatenate((centers,centers[:,[0]]),axis=1)
# label = iris_names # 设置标签
# fig = plt.figure(figsize=(6,6))
# ax = fig.add_subplot(111,polar = True)
# for i in range(len(plot_data)):
# ax.plot(angles, plot_data[i], 'o-', label='聚类群体'+str(i+1))
# ax.set_thetagrids(angles*180/np.pi, label)
# plt.legend(bbox_to_anchor=(0.8, 1.15),ncol=3) # 设置图例的位置
# plt.show()