import pandas as pd
from plotnine import *
import numpy as np
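'''Bar charts.
geom_bar is normally used with one of two stat values: the default 'count' (one variable; bar height is the number of rows per x value) or 'identity' (both x and y are supplied).
position is normally one of three values: 'stack', 'dodge' or 'fill' (percentage stacking).'''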
# I. Equal-width bar charts (each bar carries a single dimension of information)
# ------------ single-series bar chart ------------
mydata = pd.DataFrame(
    {
        'cut': ['fair', 'good', 'verygood', 'premium', 'ideal'],
        'price': [4300, 3800, 3950, 4700, 3500],
    }
)
# Sorting the rows by price is not enough on its own: per the original author's
# note the x axis would still be laid out alphabetically, so 'cut' is turned
# into an ordered categorical whose category order is the sorted row order.
sort_data = (
    mydata
    .sort_values(by='price', ascending=False)
    .assign(
        cut=lambda frame: pd.Categorical(
            frame['cut'], categories=frame['cut'], ordered=True
        )
    )
)
# Data labels are positioned through the y aesthetic of geom_text. In R's
# ggplot2 one writes y=price+100 directly; in plotnine the expression is passed
# as the string 'price+100'. x and y give the label position (x is taken from
# the plot-level mapping here).
# Alternative: geom_text(aes(label='price'), nudge_x=0.3, nudge_y=0.3).
base_plot = (
    ggplot(sort_data, aes('cut', 'price'))
    # geom_col is the stat='identity' form of geom_bar.
    + geom_col(width=0.8, colour='black', size=0.25, fill='red', alpha=0.5)
    + geom_text(aes(label='price', y='price+100'))
    + labs(y='price')
)
print(base_plot)
# ----------- multi-series bar chart -------------
df = pd.DataFrame(
    {
        '1967': [7, 4, 5, 1, 2, 3],
        '1968': [4, 3, 5, 10, 4, 8],
        'name': ['mike', 'jcak', 'jhon', 'mack', 'tony', 'martain'],
    }
)
# Wide -> long: one row per (name, year) pair with the income as the value.
df = df.melt(id_vars='name', var_name='year', value_name='income')
df['name'] = df['name'].astype('category')
# position='dodge' puts the bars of each year side by side;
# position='stack' would stack them instead.
base_plot = (
    ggplot(df, aes('name', 'income', fill='year'))
    # geom_col is the stat='identity' form of geom_bar.
    + geom_col(position='dodge', colour='black', width=0.5)
)
print(base_plot)
# -------------- multi-series stacked bar chart ----------------
df = pd.read_excel(r'E:\书籍\大三下\商务统计学\数据集\statistics for business and economics 11e Data Files\Alumni.xlsx')
print(type(df))
# df is a pandas DataFrame (the original kept an IDE type hint here).
# Keep only the first 10 schools. The original sliced 1:11, which silently
# skipped the first row. .copy() makes the subset independent of the full
# frame so that adding a column below does not trigger SettingWithCopyWarning.
df = df.iloc[:10, :].copy()
# Row total over every column except the first one.
# NOTE(review): assumes the first column is 'School' and the rest are numeric.
df['sum'] = df.iloc[:, 1:].sum(axis=1)
# Order the schools by their total so the bars appear sorted in the plot.
df = df.sort_values(by='sum')
df['School'] = pd.Categorical(df['School'], ordered=True, categories=df['School'])
# The helper column is only needed for ordering; drop it before reshaping.
df = df.drop(columns='sum')
df_melt = df.melt(id_vars='School')
base_plot = (
    ggplot(df_melt, aes(x='School', y='value', fill='variable'))
    + geom_bar(stat='identity', colour='black', width=0.5)
    + coord_flip()
    + geom_text(aes(label='value', y='value+2'), position="stack")
)
print(base_plot)
# II. Variable-width bar chart (each bar encodes two dimensions: width and height)
mydata = pd.DataFrame(
    dict(
        Name=['A', 'B', 'C', 'D', 'E'],
        Scale=[35, 30, 20, 10, 5],
        ARPU=[56, 37, 63, 57, 59],
    )
)
# Each bar is a rectangle on the x axis whose width is its Scale.
# The columns are computed with whole-column arithmetic instead of per-row
# .loc writes: this works for any number of rows and avoids writing float
# label positions into an integer column that was pre-filled with 0.
# Left edge of each rectangle: sum of the Scale values of all previous rows.
mydata['xmin'] = mydata['Scale'].cumsum() - mydata['Scale']
# Right edge of each rectangle: left edge plus this row's Scale.
mydata['xmax'] = mydata['xmin'] + mydata['Scale']
# x coordinate of the data label: the horizontal centre of the rectangle.
mydata['label'] = mydata['xmin'] + mydata['Scale'] / 2
# Draw one rectangle per row (x range from xmin/xmax, height = ARPU), put the
# ARPU value slightly above each bar and the Name just below the x axis.
base_plot = (
    ggplot(mydata)
    + geom_rect(
        aes(xmin='xmin', xmax='xmax', ymin=0, ymax='ARPU', fill='Name'),
        colour='black',
        size=0.25,
    )
    + geom_text(aes(label='ARPU', x='label', y='ARPU+3'))
    + geom_text(aes(label='Name', x='label', y=-1))
    + labs(y='ARPU')
)
print(base_plot)
#关系型数据图标(散点图,曲线图)
import pandas as pd
import numpy as np
from plotnine import *
# 1. Trend relationships
# Scatter plot with a fitted smoother
df = pd.read_excel(r'E:\书籍\大三下\商务统计学\数据集\statistics for business and economics 11e Data Files\Alumni.xlsx')
# df is a pandas DataFrame (the original kept an IDE type hint here).
print(df.head())
# Rename the columns to short names.
# NOTE(review): assumes the sheet has exactly four columns.
df.columns = list('axyz')
# NOTE(review): span is passed together with method='ols'; span is a
# smoothing-window setting for loess/lowess, so it presumably has no effect
# on an OLS fit - confirm which method was intended.
plot_losess = (
    ggplot(df, aes(x='x', y='y'))
    + geom_point(fill='black', colour='black', shape='o', alpha=0.8)
    + geom_smooth(method='ols', span=0.4, colour='blue', se=True, fill='blue')
    + scale_y_continuous(breaks=np.arange(0, 150, 25))
)
print(plot_losess)
# Residual analysis plot
import statsmodels.api as sm
# statsmodels' OLS does not add an intercept by itself. The original fitted
# y ~ x through the origin, which disagrees with the OLS line drawn by
# geom_smooth in the previous plot; add_constant supplies the intercept column.
results = sm.OLS(df.y, sm.add_constant(df.x)).fit()
# Fitted values and residuals (residual = observed - fitted; the original
# stored the opposite sign). Only the absolute residual is mapped in the plot.
df['predicted'] = results.fittedvalues
df['res'] = results.resid
df['absres'] = df['res'].abs()
base_res = (
    ggplot(df, aes(x='x', y='y'))
    # Observed points, coloured and sized by the absolute residual.
    + geom_point(aes(fill='absres', size='absres'), shape='o', colour='black')
    # Fitted line and fitted points.
    + geom_line(aes(y='predicted'), color='grey')
    + geom_point(aes(y='predicted'), shape='o')
    # Segments start at the plot-level (x, y); only the end point is specified.
    + geom_segment(aes(xend='x', yend='predicted'), alpha=0.2)
    + scale_fill_gradientn(colors=['black', 'red'])
    # fill gets a continuous colour bar, size gets a discrete legend.
    + guides(
        fill=guide_colorbar(title='res'),
        size=guide_legend(title='res'),
    )
)
print(base_res)
#2、分布关系
#qq图
import pandas as pd
from plotnine import *
import numpy as np
import random as rd
# Q-Q plot of 250 draws from a normal distribution (mean 10, sd 1),
# together with its reference line.
df = pd.DataFrame({'x': np.random.normal(loc=10, scale=1, size=250)})
base_plot = (
    ggplot(df, aes(sample='x'))
    + geom_qq(shape='o', fill='none')
    + geom_qq_line()
)
print(base_plot)
# Multi-series scatter plot
# Scatter plot
df = pd.read_excel(r'E:\书籍\大三下\商务统计学\数据集\statistics for business and economics 11e Data Files\Alumni.xlsx')
# Keep the 2nd to 4th columns and give them short names.
df = df.iloc[:, 1:4]
# df is a pandas DataFrame (the original kept an IDE type hint here).
df.columns = ['x', 'y', 'z']
# Assign every row a random group label so there are several series to show.
df['type'] = rd.choices(['a', 'b', 'c'], k=len(df))
base_plot = (
    ggplot(df, aes(x='x', y='y', colour='type', shape='type', alpha='y'))
    + geom_point()
    # Switch off the legend of one aesthetic (alpha).
    + guides(alpha=False)
)
print(base_plot)
# Bubble chart: point size encodes z, fill colour encodes the group in 'type'.
# 'type' is a discrete variable (values 'a', 'b', 'c'), so it needs a discrete
# fill scale. The original used scale_fill_gradientn, which is a continuous
# scale and cannot be applied to discrete values.
# NOTE(review): the three colours are a choice of this fix (one per group).
base_plot = (
    ggplot(df, aes(x='x', y='y'))
    + geom_point(aes(size='z', fill='type'), shape='o', alpha=0.7)
    + scale_fill_manual(values=['blue', 'purple', 'red'])
    + geom_text(aes(label='y'), nudge_x=0.3, nudge_y=0.5)
)
print(base_plot)
#曲线图
import numpy as np
import pandas as pd
from plotnine import *
df = pd.read_excel(r'E:\书籍\大三下\商务统计学\数据集\statistics for business and economics 11e Data Files\Alumni.xlsx')
# df is a pandas DataFrame (the original kept an IDE type hint here).
# NOTE(review): assumes the sheet has exactly four columns.
df.columns = ['type', 'x', 'y', 'z']
# Points joined with geom_line.
line_plot1 = (
    ggplot(df, aes(x='x', y='y'))
    + geom_point(shape='o', colour='black', fill='none')
    + geom_line(colour='grey')
)
print(line_plot1)
# Same points joined with geom_path, with the line coloured by x.
line_plot2 = (
    ggplot(df, aes(x='x', y='y'))
    + geom_point(shape='o', colour='black', fill='none')
    + geom_path(aes(colour='x'))
)
print(line_plot2)
#3、相关关系图
import numpy as np
import pandas as pd
from plotnine import *
from plotnine.data import mtcars
# Build the correlation matrix of the numeric mtcars columns.
# mtcars also carries a string column (the car name); recent pandas versions
# raise when corr() meets non-numeric data, so restrict to numeric columns.
# The matrix is rounded to one decimal. reset_index() turns the row labels
# (variable names) into an ordinary 'index' column so the frame can be melted.
mat_corr = np.round(mtcars.select_dtypes(include='number').corr(), 1).reset_index()
# Long format: one row per (index, var) pair holding the correlation value.
mydata = pd.melt(mat_corr, id_vars='index', var_name='var', value_name='value')
mydata['Absvalue'] = np.abs(mydata.value)
# Circle (bubble) correlation matrix: fill shows the signed correlation,
# circle area shows its absolute value.
base_plot = (
    ggplot(mydata, aes(x='index', y='var', fill='value', size='Absvalue'))
    + geom_point(shape='o', colour='black')
    + scale_size_area(max_size=11)
    # The original spelled the colormap 'RdYIBu_r' (capital I), which is not
    # a matplotlib colormap; the intended name is 'RdYlBu_r'. It is passed
    # positionally because the keyword for the colormap name differs between
    # plotnine releases.
    + scale_fill_cmap('RdYlBu_r')
    + coord_equal()
    + guides(size=False)
)
print(base_plot)
# Tile (heat-map) correlation matrix: fill shows the signed correlation.
base_plot = (
    ggplot(mydata, aes(x='index', y='var', fill='value'))
    + geom_tile(colour='black')
    # The original spelled the colormap 'RdYIBu_r' (capital I), which is not
    # a matplotlib colormap; the intended name is 'RdYlBu_r'. It is passed
    # positionally because the keyword for the colormap name differs between
    # plotnine releases.
    # The size scale and size guide of the original were dropped: no size
    # aesthetic is mapped in this plot, so they had nothing to act on.
    + scale_fill_cmap('RdYlBu_r')
    + coord_equal()
)
print(base_plot)
import pandas as pd
import numpy as np
from plotnine import *
from plotnine.data import mtcars
print(mtcars.head())
df = pd.read_excel(r'E:\书籍\大三下\商务统计学\数据集\statistics for business and economics 11e Data Files\Alumni.xlsx')
# df is a pandas DataFrame (the original kept an IDE type hint here).
# NOTE(review): assumes the sheet has exactly four columns.
df.columns = ['type', 'x', 'y', 'z']
# geom_histogram uses stat='bin' by default, which computes the new variables
# ..count.. and ..density..; y defaults to ..count..
# If the data are already aggregated, use stat='identity' instead.
# Explicit breaks define the bins, so the original's extra binwidth=0.5 was
# redundant and has been removed.
# Count histogram. The original also added a bare stat_density() layer, but
# no x aesthetic was available to it (none in ggplot(df), none in the layer),
# so that layer could not be drawn; it has been removed.
base_plot = (
    ggplot(df)
    + geom_histogram(
        aes(x='x'),
        breaks=np.arange(0, 90, 5),
        fill='red',
        colour='black',
    )
)
print(base_plot)
# Density-scaled histogram.
base_plot1 = (
    ggplot(df)
    + geom_histogram(
        aes(x='x', y='..density..'),
        breaks=np.arange(0, 90, 5),
        fill='red',
        colour='black',
    )
)
# The original printed base_plot again here, so this plot was never shown.
print(base_plot1)
# Kernel density of qsec, one semi-transparent filled curve per gear value.
base_plot2 = (
    ggplot(mtcars, aes(x='qsec', fill='factor(gear)'))
    + geom_density(colour='black', alpha=0.4)
)
print(base_plot2)
# Two-variable (2-D binned) frequency plot of mpg against hp.
# The original built this plot as a bare expression, so it was never shown
# when the file runs as a script; assign and print it like the other plots.
base_plot_bin2d = (
    ggplot(mtcars)
    + geom_bin2d(aes(x='mpg', y='hp', fill='..density..'))
)
print(base_plot_bin2d)
#散点数据分布图
import pandas as pd
import numpy as np
from plotnine import *
from plotnine.data import mtcars
# Jittered raw points per gear value with a semi-transparent box plot drawn
# on top of them.
base_plot = (
    ggplot(mtcars, aes(x='factor(gear)', y='disp', fill='factor(gear)'))
    + geom_jitter(
        aes(group='factor(gear)'),
        position=position_jitterdodge(),
    )
    + geom_boxplot(alpha=0.4)
)
print(base_plot)
#折线---line
#面积--area
#夹层填充--ribbon
import pandas as pd
import numpy as np
import random as rd
from plotnine import *
from plotnine.data import mtcars
# Line chart data: 20 years of random, non-repeating values for two series.
# random.sample needs a sequence; sampling from a set (as the original did)
# raises TypeError on Python 3.11+, so sample from range objects instead.
mydata = pd.DataFrame(
    dict(
        date=np.arange(2000, 2020),
        c=rd.sample(range(2, 100), 20),
        d=rd.sample(range(3, 150), 20),
    )
)
# Wide -> long: one row per (date, country) pair with the value in 'gdp'.
mydata = pd.melt(mydata, id_vars='date', var_name='country', value_name='gdp')
# Line chart: one coloured line per country.
base_plot = (
    ggplot(mydata, aes(x='date', y='gdp', colour='country'))
    + geom_line()
)
print(base_plot)
# Area chart: overlapping (not stacked) semi-transparent areas per country,
# each outlined by a line of the matching colour.
base_plot1 = (
    ggplot(mydata, aes(x='date', y='gdp'))
    + geom_area(aes(fill='country'), position=position_identity(), alpha=0.4)
    + geom_line(aes(colour='country'))
)
print(base_plot1)
#夹层填充
import pandas as pd
import numpy as np
import random as rd
from plotnine import *
from scipy import interpolate #导入插值方法
# Ribbon data: 50 years of random, non-repeating values for two series.
# random.sample needs a sequence; sampling from a set (as the original did)
# raises TypeError on Python 3.11+, so sample from range objects instead.
mydata = pd.DataFrame(
    dict(
        date=np.arange(1970, 2020),
        c=rd.sample(range(2, 100), 50),
        d=rd.sample(range(3, 150), 50),
    )
)
# Linearly interpolate both series onto a dense grid of 600 x positions.
f = interpolate.interp1d(mydata['date'], mydata['c'], kind='slinear')
g = interpolate.interp1d(mydata['date'], mydata['d'], kind='slinear')
x_new = np.linspace(mydata['date'].min(), mydata['date'].max(), 600)
c_new = f(x_new)
d_new = g(x_new)
mydata_new = pd.DataFrame(dict(x=x_new, c=c_new, d=d_new))
# Two ribbons are drawn between the curves:
#   ribbon 1 (ymin1..ymax1) from c up to d, kept only where c < d;
#   ribbon 2 (ymin2..ymax2) from d up to c, kept only where c >= d.
# Everywhere else the bounds are NaN.
c_below_d = mydata_new['c'] < mydata_new['d']
mydata_new['ymin1'] = mydata_new['c'].where(c_below_d)
mydata_new['ymax1'] = mydata_new['d'].where(c_below_d)
mydata_new['ymin2'] = mydata_new['d'].where(~c_below_d)
mydata_new['ymax2'] = mydata_new['c'].where(~c_below_d)
# Both curves plus the two ribbons between them: blue for the ymin1/ymax1
# bounds, red for the ymin2/ymax2 bounds.
base_plot3 = (
    ggplot(mydata_new, aes(x='x'))
    + geom_line(aes(y='c'), colour='green')
    + geom_line(aes(y='d'), colour='black')
    + geom_ribbon(aes(ymin='ymin1', ymax='ymax1'), fill='blue', alpha=0.5)
    + geom_ribbon(aes(ymin='ymin2', ymax='ymax2'), fill='red', alpha=0.5)
)
print(base_plot3)
#数据处理时用宽型数据更方便
import pandas as pd
import numpy as np
from plotnine import *
# Marimekko (mosaic) chart data: segment width is the segment's share of the
# grand total, and each cell's height is the variable's share in its segment.
df = pd.DataFrame(
    dict(
        segemt='A B C D'.split(' '),
        a=[2400, 1200, 600, 250],
        b=[1000, 900, 600, 250],
        g=[400, 600, 400, 250],
        d=[200, 300, 400, 250],
    )
)
df = df.set_index('segemt')
# Keep the raw values in long form; they are used as the cell labels.
melt_df = pd.melt(df.reset_index(), id_vars='segemt', var_name='variable', value_name='value')
# Per-segment totals.
df_rowsum = df.sum(axis=1)
# Convert every cell to a percentage of its segment total. Done as one
# whole-frame division instead of cell-by-cell .loc writes, which pushed
# floats into integer columns one element at a time.
df = df.div(df_rowsum, axis=0) * 100
# Segment totals as a percentage of the grand total (the segment widths).
df_rowsum = df_rowsum / df_rowsum.sum() * 100
# Running sum gives the right edge of each segment; left edge = right - width.
df['xmax'] = df_rowsum.cumsum()
df['xmin'] = df['xmax'] - df_rowsum
dfm = pd.melt(df.reset_index(), id_vars=['segemt', 'xmin', 'xmax'], value_name='percentage')
# Cumulative sum within each segment keeps one value per row (like transform),
# unlike an aggregating sum, which would return one value per group.
dfm['ymax'] = dfm.groupby('segemt')['percentage'].cumsum()
dfm['ymin'] = dfm['ymax'] - dfm['percentage']
# Label positions: the centre of each rectangle.
dfm['xtext'] = dfm['xmin'] + (dfm['xmax'] - dfm['xmin']) / 2
dfm['ytext'] = dfm['ymin'] + (dfm['ymax'] - dfm['ymin']) / 2
# Attach the geometry to the raw values.
dfm = pd.merge(left=melt_df, right=dfm, how='left', on=['segemt', 'variable'])
# Series labels drawn to the right of the chart.
df_label = pd.DataFrame(dict(x=np.repeat(102, 4), y=np.arange(12.5, 100, 25), label=['a', 'b', 'g', 'd']))
# Marimekko chart: one rectangle per (segment, variable) cell, the raw value
# printed at the cell centre, the segment name along the top (y=103) and the
# series labels from df_label, left-aligned.
base_plot = (
    ggplot()
    + geom_rect(
        data=dfm,
        mapping=aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax', fill='variable'),
        colour='black',
    )
    + geom_text(data=dfm, mapping=aes(x='xtext', y='ytext', label='value'), size=10)
    + geom_text(data=dfm, mapping=aes(x='xtext', y=103, label='segemt'), size=13)
    + geom_text(data=df_label, mapping=aes(x='x', y='y', label='label'), ha='left')
)
print(base_plot)
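# Tip: position adjustments available in plotnine
# Function | Purpose |
# ---|---|
# position_dodge() | place side by side horizontally |
# position_identity() | leave positions unchanged |
# position_stack() | stack vertically |
# position_fill() | stack as percentages |
# position_jitter() | add random jitter |
# position_jitterdodge() | dodge and jitter together |
# position_nudge(x=…,y=…) | shift by a fixed offset |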