Project

秦安怡
2023-12-01
import pandas as pd
c1=pd.read_excel('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c1.xlsx')
c2=pd.read_csv('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c2.csv')
D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (0,11) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
c1.head()
khcsrqxbdjsj
0c68b20b42002-11-02 00:00:000.02013-05-11 00:00:00.000
11ca15332NaN0.02004-11-04 16:31:52.436
2a37cc1821967-02-17 00:00:000.02004-12-31 21:24:34.216
32ab885391982-06-01 00:00:000.02010-11-19 00:00:00.000
4b4c772691964-02-05 00:00:000.02007-12-14 00:00:00.000
c2.head()
khdtimespbmslsjjespmcjfsyjhdjhgzbmgzmc
01be1e3fe2015-01-01 00:05:41.593f09c93031290.0270.20兰芝化妆品正价瓶270.20625bb8077.0兰芝柜
11be1e3fe2015-01-01 00:05:41.593f09c93031325.0302.80兰芝化妆品正价瓶302.80625bb8077.0兰芝柜
21be1e3fe2015-01-01 00:05:41.593f09c93031195.0181.80兰芝化妆品正价瓶181.80625bb8077.0兰芝柜
31be1e3fe2015-01-01 00:05:41.593f09c93031270.0251.55兰芝化妆品正价瓶251.55625bb8077.0兰芝柜
41be1e3fe2015-01-01 00:05:41.593f09c93032245.0456.55兰芝化妆品正价瓶456.55625bb8077.0兰芝柜
#去重
c1=c1.drop_duplicates()
c2=c2.drop_duplicates()
#合并数据集
c = pd.merge(c1,c2)
c.head()
khcsrqxbdjsjdtimespbmslsjjespmcjfsyjhdjhgzbmgzmc
0a37cc1821967-02-17 00:00:000.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e31499.0297.07WMF D无297.081017cd87296.0WMF 柜
1a37cc1821967-02-17 00:00:000.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e3112288.011089.93WMF D无11089.921017cd87296.0WMF 柜
2a37cc1821967-02-17 00:00:000.02004-12-31 21:24:34.2162017-02-28 17:17:35.533252403ef1598.0598.00双立人商品 F无598.00102ed0a7242.0双立人柜
334c2dc931986-07-18 00:00:000.02007-04-19 00:00:00.0002017-11-13 13:36:45.6564708e0bb1260.0260.00植村秀三色眼影/眼线液/眉笔.260.0097c3f48156.0Shu Uemura
4828aa9b41951-12-29 00:00:000.02004-09-24 16:39:08.7162017-05-17 11:30:39.080a480930711288.01288.00酷彩F件1288.0010295b07300.0Le Creuset(酷彩)
# 查看数据的形状
print("数据的形状:", c.shape)
# 查看每列数据的非空值的个数和数据类型
print("每列数据的非空值的个数和数据类型:\n", c.info())
# 查看每列数据的描述统计信息
print("数据的描述统计信息:\n", c.describe())
# 查看每列数据的空值个数
print("每列数据的空值个数:\n", c.isnull().sum())
数据的形状: (506006, 15)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh       506006 non-null object
csrq     482173 non-null object
xb       495568 non-null float64
djsj     474547 non-null datetime64[ns]
dtime    506006 non-null object
spbm     506006 non-null object
sl       506006 non-null int64
sj       506006 non-null float64
je       506006 non-null float64
spmc     506006 non-null object
jf       506006 non-null float64
syjh     506006 non-null int64
djh      506006 non-null object
gzbm     506006 non-null float64
gzmc     492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
每列数据的非空值的个数和数据类型:
 None
数据的描述统计信息:
                   xb             sl            sj            je  \
count  495568.000000  506006.000000  5.060060e+05  5.060060e+05   
mean        0.098372       1.054300  1.430813e+03  1.356862e+03   
std         0.297817       2.366001  3.695847e+03  3.706632e+03   
min         0.000000     -30.000000  1.000000e-01 -6.712575e+05   
25%         0.000000       1.000000  3.400000e+02  3.200000e+02   
50%         0.000000       1.000000  7.200000e+02  6.960000e+02   
75%         0.000000       1.000000  1.622000e+03  1.577430e+03   
max         1.000000    1492.000000  1.342515e+06  1.342515e+06   

                  jf           syjh           gzbm  
count  506006.000000  506006.000000  506006.000000  
mean     1030.375453     111.340160    6296.818781  
std      2125.932871      82.427647    2363.259002  
min   -201616.000000       1.000000       0.000000  
25%       145.000000      43.000000    4334.000000  
50%       500.000000      97.000000    8064.000000  
75%      1270.000000     170.000000    8121.000000  
max    268503.000000     320.000000    9005.000000  
每列数据的空值个数:
 kh           0
csrq     23833
xb       10438
djsj     31459
dtime        0
spbm         0
sl           0
sj           0
je           0
spmc         0
jf           0
syjh         0
djh          0
gzbm         0
gzmc     13543
dtype: int64
c.describe()
xbslsjjejfsyjhgzbm
count495568.000000506006.0000005.060060e+055.060060e+05506006.000000506006.000000506006.000000
mean0.0983721.0543001.430813e+031.356862e+031030.375453111.3401606296.818781
std0.2978172.3660013.695847e+033.706632e+032125.93287182.4276472363.259002
min0.000000-30.0000001.000000e-01-6.712575e+05-201616.0000001.0000000.000000
25%0.0000001.0000003.400000e+023.200000e+02145.00000043.0000004334.000000
50%0.0000001.0000007.200000e+026.960000e+02500.00000097.0000008064.000000
75%0.0000001.0000001.622000e+031.577430e+031270.000000170.0000008121.000000
max1.0000001492.0000001.342515e+061.342515e+06268503.000000320.0000009005.000000
c.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh       506006 non-null object
csrq     482173 non-null object
xb       495568 non-null float64
djsj     474547 non-null datetime64[ns]
dtime    506006 non-null object
spbm     506006 non-null object
sl       506006 non-null int64
sj       506006 non-null float64
je       506006 non-null float64
spmc     506006 non-null object
jf       506006 non-null float64
syjh     506006 non-null int64
djh      506006 non-null object
gzbm     506006 non-null float64
gzmc     492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
# 查看销售日期列数据中每个值的个数
c.dtime.value_counts()
2017-11-25 15:36:23.626    54
2015-04-25 11:16:56.140    45
2015-03-02 12:12:11.923    38
2015-04-26 13:19:38.080    37
2016-06-24 14:27:57.830    37
2016-09-23 14:50:44.063    36
2015-04-24 18:47:34.390    35
2015-06-03 16:07:30.813    34
2016-12-31 16:45:42.343    33
2015-04-26 17:40:18.063    33
2015-04-24 17:03:14.423    32
2017-11-26 16:02:26.936    32
2017-08-27 17:29:24.376    31
2016-11-25 12:19:11.720    31
2016-08-27 12:13:19.203    31
2015-04-26 11:20:21.813    30
2015-01-10 17:13:34.343    30
2015-04-24 15:17:28.390    30
2016-09-23 18:09:19.796    30
2015-08-17 13:44:25.686    30
2017-05-12 16:14:14.360    29
2015-01-01 11:52:18.890    29
2017-09-23 15:48:34.186    29
2017-10-08 16:41:18.283    28
2015-04-24 17:17:20.110    27
2017-08-26 15:06:12.046    27
2015-04-25 15:24:03.436    27
2017-11-26 15:05:00.516    27
2015-02-16 17:28:15.906    26
2017-05-12 18:59:47.906    26
                           ..
2016-11-04 14:03:26.453     1
2016-03-30 16:48:55.110     1
2017-10-19 17:27:16.126     1
2016-05-24 15:16:28.936     1
2017-05-23 14:34:51.546     1
2015-04-24 16:33:23.423     1
2017-01-19 17:12:30.610     1
2015-06-24 12:10:02.813     1
2016-07-26 19:34:21.330     1
2015-04-02 20:47:07.453     1
2016-01-09 16:42:14.156     1
2017-05-08 13:52:32.266     1
2017-06-18 16:39:42.436     1
2015-01-16 16:54:14.063     1
2017-05-05 13:18:11.686     1
2017-02-03 18:28:31.203     1
2017-05-19 12:08:02.500     1
2017-09-17 15:24:35.500     1
2017-08-17 12:41:08.626     1
2015-07-13 11:04:57.796     1
2015-04-30 17:17:53.110     1
2017-07-05 18:30:27.813     1
2016-06-28 12:15:22.093     1
2015-07-27 12:07:34.093     1
2015-05-02 19:00:01.626     1
2017-10-13 13:44:54.173     1
2017-09-22 17:02:49.360     1
2015-02-01 17:17:37.580     1
2017-04-08 12:51:00.110     1
2016-06-28 11:58:35.296     1
Name: dtime, Length: 264022, dtype: int64
## 查看销售日期列数据中每年的个数
import datetime as dt
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
c.dtime.dt.year.value_counts()
2017    231373
2016    146958
2015    123852
2018      3823
Name: dtime, dtype: int64
# 数据去重
new_c = c.drop_duplicates()
print(new_c.shape)
(506006, 15)
# 去除数据中的空值
c = new_c.dropna()
c.shape
(445894, 15)
c.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 445894 entries, 0 to 506005
Data columns (total 15 columns):
kh       445894 non-null object
csrq     445894 non-null object
xb       445894 non-null float64
djsj     445894 non-null datetime64[ns]
dtime    445894 non-null datetime64[ns]
spbm     445894 non-null object
sl       445894 non-null int64
sj       445894 non-null float64
je       445894 non-null float64
spmc     445894 non-null object
jf       445894 non-null float64
syjh     445894 non-null int64
djh      445894 non-null object
gzbm     445894 non-null float64
gzmc     445894 non-null object
dtypes: datetime64[ns](2), float64(5), int64(2), object(6)
memory usage: 54.4+ MB
#定义年龄0-44为青年,45-59为中年人,60以上为老年人
#在c表中csrq
import datetime as dt
#提取年数
c['csrq']=pd.to_datetime(c['csrq'],errors='coerce')
c['year'] = c['csrq'].dt.year
#年龄计算
now_year = dt.datetime.now().year
c['age'] = now_year - c['year']
#年龄阶层
bins = [0,44,59,120] 
labels = ['青年 ','中年人', '老年人'] 
c['年龄阶层'] = pd.cut(c['age'], bins, labels=labels)
#季节
c['month']=c['dtime'].dt.month
bins = [0,2.9,5,8,11,12.1] 
labels = ['冬季 ','春季', '夏季', '秋季','冬季']
#labels = ['Winter ','Spring', 'Summer', 'Autumn','Winter'] 
c['季节'] = pd.cut(c['month'], bins, labels=labels)
#入会时长数据定义、处理与描述
c['djsj']=pd.to_datetime(c['djsj'],errors='coerce')
c['入会时长']=dt.datetime.now().year-c['djsj'].dt.year
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
c.head()
khcsrqxbdjsjdtimespbmslsjjespmc...syjhdjhgzbmgzmcyearage年龄阶层month季节入会时长
0a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e31499.0297.07WMF D无...1017cd87296.0WMF 柜196753中年人9秋季16
1a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e3112288.011089.93WMF D无...1017cd87296.0WMF 柜196753中年人9秋季16
2a37cc1821967-02-170.02004-12-31 21:24:34.2162017-02-28 17:17:35.533252403ef1598.0598.00双立人商品 F无...102ed0a7242.0双立人柜196753中年人2冬季16
334c2dc931986-07-180.02007-04-19 00:00:00.0002017-11-13 13:36:45.6564708e0bb1260.0260.00植村秀三色眼影/眼线液/眉笔....97c3f48156.0Shu Uemura198634青年11秋季13
4828aa9b41951-12-290.02004-09-24 16:39:08.7162017-05-17 11:30:39.080a480930711288.01288.00酷彩F件...10295b07300.0Le Creuset(酷彩)195169老年人5春季16

5 rows × 21 columns

c.age.describe()
count    445894.000000
mean         63.031622
std          62.198657
min           3.000000
25%          38.000000
50%          46.000000
75%          53.000000
max         267.000000
Name: age, dtype: float64
#去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
c['age'] = c['age'].drop(c['age'][c['age'] > 120].index)
#去除不符合逻辑的值:删除消费金额小于0的行
c['je']=c['je'].drop(c['je'][c['je'] < 0].index)
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
# c.rename(
#     columns={'kh':'卡号', 'dtime':'消费时间', 'spbm':'商品编码', 'sl':'数量',
#              'sj':'售价', 'je':'实际消费金额', 'spmc':'商品名称', 'jf':'积分',
#              'syjh':'收银机号', 'djh':'单据号', 'gzbm':'柜组编码',
#              'gzmc':'柜组名称'}, inplace = True)
import numpy as np
c.groupby(by=["spmc"])["je"].agg({"各类商品销售金额":np.sum})
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
各类商品销售金额
spmc
.雷达钟表无1370300.52
23区女装系列A件1655993.77
23区女装系列B.5件253656.22
23区女装系列B件280106.95
23区女装系列C件258243.90
23区女装系列D件63571.14
23区女装系列E件105316.01
23区女装系列F件1426.00
23区女装系列正价件14810.40
3S 系列B无1599.00
3S 系列C无1399.00
5+E285.00
5+E.335.00
5+服饰A件1521.00
5+服饰C件978.00
5+服饰E件445.00
6F-NIKEKIDS正价件199.00
6F欣欣玩具正价个713.00
8848 正价个270080.00
A.O.史密斯 正价件146028.04
ABLE JEANS正价件1399.00
AD 正价无799.00
ADSE 正价无699.00
ADSE 系列A无508.00
AD集合店正价1996.00
AO2F件5615.00
AO2正价件1434.00
AOJO 正价副549.00
APM D无48688.00
APM 正价无2364639.24
......
香奈儿限量35ml邂逅35ml6077.00
香奈儿青春光彩保湿粉凝霜.16182.50
香奈儿青春光彩柔润粉芯.22616.94
香奈儿青春光彩柔润粉饼13g281251.06
香奈儿青春光彩水润粉底液30ml2899.16
香奈儿魅力润体乳200ml1010.00
香水 正价无95674.49
马天奴 A件2788.00
骆驼新B*670.00
高美高手袋系列A个14881.11
高美高手袋系列B个51714.90
高美高手袋系列F个3397.00
高美高手袋系列正价个2088.00
高美高正价*1888.00
高美高特卖个5568.00
高美高系列正价个479329.78
魅可唇膏支510.00
魅可持久防水眼线笔支200.00
魅可新式卷笔刀支40.00
魅可时尚焦点小眼影(片)支115.00
魅可焦点小眼影支155.00
黄金金条件37978.20
黄金饰品克105375.00
黛安芬内衣系列3件36.00
黛安芬内衣系列B件47239.81
黛安芬内衣系列C件24655.99
黛安芬内衣系列E件2124.00
黛安芬内衣系列F件2647.92
黛安芬内衣系列件475907.51
黛安芬内衣系列员购件8669.00

5936 rows × 1 columns

je = c.groupby(by=["spmc"])["je"].agg({"sum":np.sum})[0:5]
#index = range(len(je))
index = range(5)
plt.bar(index, je["sum"], color="dodgerblue")
plt.xticks(index, je.index)
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-59b1cBLf-1584007168716)(output_18_1.png)]

import matplotlib.pyplot as plt
qn=c.age[(c.age < 45)].shape[0]
zn=c.age[(c.age < 60)&(c.age >= 45)].shape[0]
ln=c.age[(c.age >= 60)].shape[0]
import matplotlib.pyplot as plt
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member age')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XOo9KI81-1584007168721)(output_19_0.png)]

#性别人数比例和不同性别消费金额比例
#性别,0女,1男
#性别比例图
#年龄占比图
import matplotlib.pyplot as plt
Female=c['xb'][(c['xb'] == 0)].shape[0]
Male=c['xb'][(c['xb'] == 1)].shape[0]
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightgreen','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member sex')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XPB7UnkN-1584007168723)(output_20_0.png)]

#不同年龄阶层消费总额占比
qne = c[(c.age < 45)]['je'].sum()
zne = c[(c.age < 60)&(c.age >= 45)]['je'].sum()
lne = c[(c.age >= 60)]['je'].sum()
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different age member''s payment')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-muUEQQEd-1584007168727)(output_21_0.png)]

#不同性别消费金额占比
import matplotlib.pyplot as plt
Female=c[(c['xb'] == 0)]['je'].sum()
Male=c[(c['xb'] == 1)]['je'].sum()
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightskyblue','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different members sex payment')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-AyHHa9WB-1584007168729)(output_22_0.png)]

#会员订单数=会员字段非空且单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
hy=c2[(c2['kh'].isnull()==False)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
#非会员订单数=会员字段为空但单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
fhy=c2[(c2['kh'].isnull()==True)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
# #提取年数
# hy['csrq']=pd.to_datetime(hy['csrq'],errors='coerce')
# hy['year'] = hy['csrq'].dt.year
# fhy['csrq']=pd.to_datetime(fhy['csrq'],errors='coerce')
# fhy['year'] = fhy['csrq'].dt.year
# #年龄计算
# now_year = dt.datetime.now().year
# hy['age'] = now_year - hy['year']
# fhy['age'] = now_year - fhy['year']
# #去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
# hy['age'] = hy['age'].drop(hy['age'][hy['age'] > 120].index)
# fhy['age'] = fhy['age'].drop(fhy['age'][fhy['age'] > 120].index)
#c2['kh'](c2['kh'].isnull()==True)
num_hy=hy.iloc[:,0].size
num_fhy=fhy.iloc[:,0].size
print (num_hy,num_fhy)
869072 997811
#会员与非会员的订单数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(num_hy,num_fhy)
colors=['lightgreen','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Srh33bjN-1584007168731)(output_24_0.png)]

#会员与非会员的消费金额总数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(hy['je'].sum(),fhy['je'].sum())
colors=['lightblue','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jERIprf7-1584007168740)(output_25_0.png)]

#2015-2018各年会员消费总额
#对hy表格增加年、月两列,删除除je、year、month的列
hy['dtime']=pd.to_datetime(hy['dtime'],errors='coerce')
hyy=hy
hyy['year']=hyy['dtime'].dt.year
hyy['month']=hyy['dtime'].dt.month
hyy=hyy.drop(['kh', 'dtime','spbm','sl','sj','spmc','jf','syjh','djh','gzbm','gzmc'], axis=1)
#会员2015-2018各年会员消费总额
import numpy as np
je_counts=hyy.groupby(by=["year"])["je"].agg({"各年销售金额":np.sum})
#饼图
plt.pie(je_counts, labels=je_counts.index,explode=[0.1,0,0.1,0.1],
        colors=['lightblue','gold','lightgreen','lightcoral'], 
        autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of yearly payment')
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  # Remove the CWD from sys.path while we load stuff.
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:14: MatplotlibDeprecationWarning: Non-1D inputs to pie() are currently squeeze()d, but this behavior is deprecated since 3.1 and will be removed in 3.3; pass a 1D array instead.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tUu3vx9P-1584007168751)(output_26_1.png)]

#2015-2018年每个月的消费金额
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
for i in range(2015, 2019):
    monthly = []
    for j in range(0,13):
        monthly.append(c[(c['dtime'].dt.year == i) & (c['dtime'].dt.month == j)]['je'].sum())
    plt.plot(range(0,len(monthly)), monthly,color=(0.4, 0.8, 0.6))
    plt.xlabel("Month")
    plt.ylabel("Amount")
    plt.grid(True)
    plt.title('The consumption in '+str(i))
    plt.show()    
# plt.plot(sl_counts.index, sl_counts, "-", color=(0.4, 0.8, 0.6))
# plt.xlabel("日期")
# plt.xticks(rotation=15)
# plt.ylabel("销量")
# plt.grid(True)
# plt.title("折线图")
# plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IGATV2CI-1584007168765)(output_27_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yrT9weJs-1584007168776)(output_27_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8O0NO7gq-1584007168780)(output_27_3.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uJ5IRmQ4-1584007168786)(output_27_4.png)]

#各时段消费人数
zs=c['dtime'][(c['dtime'].dt.hour >= 5)&(c['dtime'].dt.hour < 11)].shape[0]
zw=c['dtime'][(c['dtime'].dt.hour >= 11)&(c['dtime'].dt.hour < 14)].shape[0]
xw=c['dtime'][(c['dtime'].dt.hour >= 14)&(c['dtime'].dt.hour < 19)].shape[0]
ws=c['dtime'][(c['dtime'].dt.hour >= 19)&(c['dtime'].dt.hour < 24)].shape[0]
lc=c['dtime'][(c['dtime'].dt.hour >= 0)&(c['dtime'].dt.hour < 5)].shape[0]
print ('早上:05:00:00~10:59:59订单数量:%d'%zs)
print ('中午:11:00:00~13:59:59订单数量:%d'%zw)
print ('下午:14:00:00~18:59:59订单数量:%d'%xw)
print ('晚上:19:00:00~23:59:59订单数量:%d'%ws)
print ('凌晨:00:00:00~04:59:59订单数量:%d'%lc)
早上:05:00:00~10:59:59订单数量:17833
中午:11:00:00~13:59:59订单数量:104883
下午:14:00:00~18:59:59订单数量:223950
晚上:19:00:00~23:59:59订单数量:99136
凌晨:00:00:00~04:59:59订单数量:92
#各时段人数可视化图
import matplotlib.pyplot as plt
labels=('Morning','Noon','Early morning','Afternoon','Evening')
sizes=(zs,zw,lc,xw,ws)
explode=[0,0.1,0,0.1,0.1]
plt.pie(sizes,explode=explode,labels=labels,colors=['lightblue','gold','lightgreen','lightcoral','lightgreen'],
        autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member time')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-N8C60Yuv-1584007168789)(output_29_0.png)]

#春季3、4、5月;夏季6、7、8月;秋季9、10、11月;冬季12、1、2月
Spring=c['dtime'][(c['dtime'].dt.month >= 3)&(c['dtime'].dt.month <= 5)].shape[0]
Summer=c['dtime'][(c['dtime'].dt.month >= 6)&(c['dtime'].dt.month <= 8)].shape[0]
Autumn=c['dtime'][(c['dtime'].dt.month >= 9)&(c['dtime'].dt.month <= 11)].shape[0]
Winter=c['dtime'][(c['dtime'].dt.month >= 12)&(c['dtime'].dt.month <= 2)].shape[0]
print ('春季3、4、5月订单数量:%d'%Spring)
print ('夏季6、7、8月订单数量:%d'%Summer)
print ('秋季9、10、11月订单数量:%d'%Autumn)
print ('冬季12、1、2月订单数量:%d'%Winter)
春季3、4、5月订单数量:132681
夏季6、7、8月订单数量:118389
秋季9、10、11月订单数量:100197
冬季12、1、2月订单数量:0
#各季节会员消费人数
import matplotlib.pyplot as plt
labels=('Spring','Summer','Autumn','Winter')
sizes=(Spring,Summer,Autumn,Winter)
explode=[0.1,0,0.1,0]
colors=['lightgreen','gold','lightskyblue','lightcoral']
plt.pie(sizes,explode=explode,labels=labels,colors=colors
        ,autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of season')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EwYd2g5S-1584007168790)(output_31_0.png)]

c.head()
khcsrqxbdjsjdtimespbmslsjjespmc...syjhdjhgzbmgzmcyearage年龄阶层month季节入会时长
0a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e31499.0297.07WMF D无...1017cd87296.0WMF 柜196753.0中年人9秋季16
1a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e3112288.011089.93WMF D无...1017cd87296.0WMF 柜196753.0中年人9秋季16
2a37cc1821967-02-170.02004-12-31 21:24:34.2162017-02-28 17:17:35.533252403ef1598.0598.00双立人商品 F无...102ed0a7242.0双立人柜196753.0中年人2冬季16
334c2dc931986-07-180.02007-04-19 00:00:00.0002017-11-13 13:36:45.6564708e0bb1260.0260.00植村秀三色眼影/眼线液/眉笔....97c3f48156.0Shu Uemura198634.0青年11秋季13
4828aa9b41951-12-290.02004-09-24 16:39:08.7162017-05-17 11:30:39.080a480930711288.01288.00酷彩F件...10295b07300.0Le Creuset(酷彩)195169.0老年人5春季16

5 rows × 21 columns

##任务3.1:构建会员用户“基本”特征标签
#固有基础属性标签:性别、年龄阶层、入会时长
c['性别']=c['xb'].map({1.0:'男',0.0:'女'})
# c['年龄']=aaa.replace("黑色","黄色")
im=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj',
              'syjh','djh','gzbm','gzmc','year','age','month'], axis=1)
im.head()
khjespmcjf年龄阶层季节入会时长性别消费水平新老会员
0a37cc182297.07WMF D无297.08中年人秋季16低消费老会员
1a37cc18211089.93WMF D无11089.92中年人秋季16高消费老会员
2a37cc182598.00双立人商品 F无598.00中年人冬季16中等消费老会员
334c2dc93260.00植村秀三色眼影/眼线液/眉笔.260.00青年秋季13低消费老会员
4828aa9b41288.00酷彩F件1288.00老年人春季16中等消费老会员
#任务3.2:构建会员用户“业务”特征标签
#基础性息处理后得到标签:消费水平(300以下低消费;300-1500中等消费;1500以上高消费)
#入会时长:新老会员;消费水平:低中高;有价值的用户信息
#消费金额数据描述
im['je'].describe()
im['入会时长'].describe()
count    445894.000000
mean          6.322101
std           2.989830
min           2.000000
25%           4.000000
50%           5.000000
75%           8.000000
max          18.000000
Name: 入会时长, dtype: float64
im['消费水平'] = pd.cut(im['je'], bins=[0,300,1500,9999999], labels=['低消费','中等消费', '高消费'] )
im['新老会员'] = pd.cut(im['入会时长'], bins=[0,5,999], labels=['新会员','老会员'] )
im.head()
khjejf年龄阶层季节入会时长性别消费水平新老会员
0a37cc182297.07297.08中年人秋季16低消费老会员
1a37cc18211089.9311089.92中年人秋季16高消费老会员
2a37cc182598.00598.00中年人冬季16中等消费老会员
334c2dc93260.00260.00青年秋季13低消费老会员
4828aa9b41288.001288.00老年人春季16中等消费老会员
#任务3.3:构建会员用户“偏好”特征标签
#用户行为推测标签:经常购买运动T、裤、鞋为体育标签;购物行为——怀孕标签
#购物时间段、季节、商品类型偏好
im['购物时间段'] = pd.cut(c['dtime'].dt.hour, bins=[0,5,11,14,19,24], labels=['凌晨','早上', '中午','下午','晚上'] )
im.head()
khjespmcjf年龄阶层季节入会时长性别消费水平新老会员购物时间段
0a37cc182297.07WMF D无297.08中年人秋季16低消费老会员早上
1a37cc18211089.93WMF D无11089.92中年人秋季16高消费老会员早上
2a37cc182598.00双立人商品 F无598.00中年人冬季16中等消费老会员下午
334c2dc93260.00植村秀三色眼影/眼线液/眉笔.260.00青年秋季13低消费老会员中午
4828aa9b41288.00酷彩F件1288.00老年人春季16中等消费老会员早上
#删去无关列
im=im.drop(['je','jf','入会时长'], axis=1)
im.head()
khspmc年龄阶层季节性别消费水平新老会员购物时间段
0a37cc182WMF D无中年人秋季低消费老会员早上
1a37cc182WMF D无中年人秋季高消费老会员早上
2a37cc182双立人商品 F无中年人冬季中等消费老会员下午
334c2dc93植村秀三色眼影/眼线液/眉笔.青年秋季低消费老会员中午
4828aa9b4酷彩F件老年人春季中等消费老会员早上
#任务3.4:建立用户画像
#整合所有标签信息
#1.文字呈现每个用户的画像结果(#会员卡号、性别、偏好品类、近期消费次数)
#2.可视化方式呈现
#对2bde7c95
bd=im[(im['kh']=='2bde7c95')]
bd=bd.drop(['kh'], axis=1)
bd.head()
bd.to_csv('D:/test.csv') 


# import matplotlib.pyplot as plt
# mid_je = im
# mid_je = mid_je.reset_index()
# dic = {x[0]:x[1] for x in mid_je.loc[:,:].values}
# wc= wordcloud.WordCloud(scale=16, font_path='simhei.ttf',
#                         background_color='white', max_words=100,colormap="coolwarm")
# X = wc.generate_from_frequencies(dic)
# plt.axis('off')
# plt.imshow(X)
# wordcloud = WordCloud(background_color='white',scale=1.5).generate(bd)
# #显示词云图片
# plt.imshow(wordcloud)
# plt.axis('off')
# plt.show()
#导入wordcloud模块和matplotlib模块
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy.misc import imread
#读取一个txt文件
with open("D:/test.csv" ,encoding="utf-8")as file:
    #读取文本内容
    text=file.read()
#读入背景图片
# bg_pic = imread('D:/3.png')
#生成词云
wordcloud = WordCloud(background_color='white',font_path='simhei.ttf',scale=1.5,
                       max_words=100,colormap="coolwarm").generate(text)
# image_colors = ImageColorGenerator(bg_pic)
#显示词云图片
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#保存图片
# wordcloud.to_file('test.jpg')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-P9HCikf3-1584007168794)(output_39_0.png)]

c.head()
khcsrqxbdjsjdtimespbmslsjjespmc...gzmcyearage年龄阶层month季节入会时长性别消费水平新老会员
0a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e31499.0297.07WMF D无...WMF 柜196753.0中年人9秋季16低消费老会员
1a37cc1821967-02-170.02004-12-31 21:24:34.2162016-09-25 10:49:14.016d62a69e3112288.011089.93WMF D无...WMF 柜196753.0中年人9秋季16高消费老会员
2a37cc1821967-02-170.02004-12-31 21:24:34.2162017-02-28 17:17:35.533252403ef1598.0598.00双立人商品 F无...双立人柜196753.0中年人2冬季16中等消费老会员
334c2dc931986-07-180.02007-04-19 00:00:00.0002017-11-13 13:36:45.6564708e0bb1260.0260.00植村秀三色眼影/眼线液/眉笔....Shu Uemura198634.0青年11秋季13低消费老会员
4828aa9b41951-12-290.02004-09-24 16:39:08.7162017-05-17 11:30:39.080a480930711288.01288.00酷彩F件...Le Creuset(酷彩)195169.0老年人5春季16中等消费老会员

5 rows × 24 columns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
km=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj','spmc',
              'syjh','gzbm','gzmc','year','age','month','jf',
           '年龄阶层','季节','性别','消费水平','新老会员'], axis=1)
km_cp=km.copy()
km.head()
khjedjh入会时长
0a37cc182297.077cd816
1a37cc18211089.937cd816
2a37cc182598.00ed0a16
334c2dc93260.00c3f413
4828aa9b41288.0095b016
#入会时长 (Duration)
#消费频率 (Frequency)
#消费金额 (Monetary)
#计算最近一次消费
D=km_cp[['kh','入会时长']]
#计算消费次数
Fre=km_cp[['kh','djh']].groupby('kh').count().reset_index()
#用户个人总消费金额
Mone=km_cp[['kh','je']].groupby('kh').sum().reset_index()
#合并数据集
km_1=pd.merge(D,Fre,on='kh',how='inner')
km_2=pd.merge(km_1,Mone,on='kh',how='inner')
km_2=km_2.rename(columns={'kh':'卡号','djh':'消费频率','je':'消费金额'})
km_2.describe()
入会时长消费频率消费金额
count445894.000000445894.0000004.458940e+05
mean6.32210185.9048881.172853e+05
std2.989830262.1713403.098385e+05
min2.0000001.0000000.000000e+00
25%4.00000011.0000001.071100e+04
50%5.00000032.0000003.541280e+04
75%8.00000081.0000001.081595e+05
max18.0000003039.0000003.257258e+06
#K-means模型训练
km_future=km_2[['入会时长','消费频率','消费金额']]
km_2_SC=StandardScaler().fit_transform(data_future)
KMeans_model=KMeans(n_clusters=3)
fit_model=KMeans_model.fit(data_b_SC) 
print(KMeans_model.cluster_centers_)
[[ 1.30518985 -0.10829497 -0.13086107]
 [-1.11113518 11.26400174 10.13423447]
 [-0.54919514 -0.06399474 -0.04323129]]
#K-means聚类雷达图绘制
angles=np.linspace(0,2*np.pi,3,endpoint=False) 
angles=np.concatenate((angles,[angles[0]]))
centers=KMeans_model.cluster_centers_ 
plt_data=np.concatenate((centers,centers[:,[0]]),axis=1)
label=['Duration','Frequency','Monetary']
fig=plt.figure(figsize=(6,6))
ax=fig.add_subplot(111,polar=True)
for i in range(len(plt_data)):
    ax.plot(angles,plt_data[i],'o-',label=label[i])
ax.set_thetagrids(angles*180/np.pi,label)
plt.legend(bbox_to_anchor=(0.8,1.15),ncol=3)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IFPDvVQF-1584007168796)(output_44_0.png)]

# from sklearn.preprocessing import StandardScaler # 导入数据标准化模块
# from sklearn.cluster import KMeans # 导入K-Means聚类模块
# from sklearn.datasets import load_iris # 导入鸢尾花数据
# iris = load_iris()
# iris_data = iris['data']
# iris_target = iris['target']
# iris_names = iris['feature_names']
# sc = StandardScaler()
# iris_data = sc.fit_transform(iris_data) # 数据标准化
# model = KMeans(n_clusters=3,random_state=0,max_iter=500) # 构建聚类模块
# fit_model = model.fit(iris_data) # 训练聚类模块
# #输出聚类中心
# print('聚类中心\n',model.cluster_centers_)
# print('类别\n',pd.Series(model.labels_).value_counts())
# angles = np.linspace(0,2*np.pi,4,endpoint=False) # 将圆根据标签的个数等比分
# angles = np.concatenate((angles,[angles[0]])) # 闭合
# centers = model.cluster_centers_  # 获取聚类中心数据
# plot_data = np.concatenate((centers,centers[:,[0]]),axis=1)
# label = iris_names # 设置标签
# fig = plt.figure(figsize=(6,6))
# ax = fig.add_subplot(111,polar = True)
# for i in range(len(plot_data)):
#     ax.plot(angles, plot_data[i], 'o-', label='聚类群体'+str(i+1))
# ax.set_thetagrids(angles*180/np.pi, label)
# plt.legend(bbox_to_anchor=(0.8, 1.15),ncol=3) # 设置图例的位置
# plt.show()

 类似资料: