第07章 分组聚合、过滤、转换

优质
小牛编辑
134浏览
2023-12-01
 In[1]: import pandas as pd
        import numpy as np

1. 定义聚合

# 读取flights数据集,查询头部
 In[2]: flights = pd.read_csv('data/flights.csv')
        flights.head()
Out[2]:

# 按照AIRLINE分组,使用agg方法,传入要聚合的列和聚合函数
 In[3]: flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()
Out[3]:
# 或者要选取的列使用索引,聚合函数作为字符串传入agg
 In[4]: flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()
Out[4]: 
AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
Name: ARR_DELAY, dtype: float64

# 也可以向agg中传入NumPy的mean函数
 In[5]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()
Out[5]:

# 也可以直接使用mean()函数
 In[6]: flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()
Out[6]:

原理

# groupby方法产生的是一个DataFrameGroupBy对象
 In[7]: grouped = flights.groupby('AIRLINE')
        type(grouped)
Out[7]: pandas.core.groupby.DataFrameGroupBy

更多

# 如果agg接收的不是聚合函数,则会导致异常
 In[8]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:842: RuntimeWarning: invalid value encountered in sqrt
  f = lambda x: func(x, *args, **kwargs)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:3015: RuntimeWarning: invalid value encountered in sqrt
  output = func(group, *args, **kwargs)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
   2177         try:
-> 2178             return self._aggregate_series_fast(obj, func)
   2179         except Exception:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
   2197                                     dummy)
-> 2198         result, counts = grouper.get_result()
   2199         return result, counts

pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()

pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38973)()

pandas/_libs/src/reduce.pyx in pandas._libs.lib._get_result_array (pandas/_libs/lib.c:32039)()

ValueError: function does not reduce

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2882             try:
-> 2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2884             except Exception:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
    847             try:
--> 848                 result, counts = self.grouper.agg_series(obj, f)
    849                 output[name] = self._try_cast(result, obj, numeric_only=True)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
   2179         except Exception:
-> 2180             return self._aggregate_series_pure_python(obj, func)
   2181 

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
   2214                         isinstance(res, list)):
-> 2215                     raise ValueError('Function does not reduce')
   2216                 result = np.empty(ngroups, dtype='O')

ValueError: Function does not reduce

During handling of the above exception, another exception occurred:

Exception                                 Traceback (most recent call last)
<ipython-input-8-2bcc9ccfec77> in <module>()
----> 1 flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2884             except Exception:
-> 2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
   2886 
   2887             index = Index(sorted(result), name=self.grouper.names[0])

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
   3015             output = func(group, *args, **kwargs)
   3016             if isinstance(output, (Series, Index, np.ndarray)):
-> 3017                 raise Exception('Must produce aggregated value')
   3018             result[name] = self._try_cast(output, group)
   3019 

Exception: Must produce aggregated value

2. 用多个列和函数进行分组和聚合

# 导入数据
 In[9]: flights = pd.read_csv('data/flights.csv')
        flights.head()
Out[9]:

# 每家航空公司每周平均每天取消的航班数
 In[10]: flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].agg('sum').head(7)
Out[10]: AIRLINE  WEEKDAY
         AA       1          41
                  2           9
                  3          16
                  4          20
                  5          18
                  6          21
                  7          29
         Name: CANCELLED, dtype: int64
# 分组可以是多组,选取可以是多组,聚合函数也可以是多个
# 每周每家航空公司取消或改变航线的航班总数和比例
 In[11]: flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head(7)
Out[11]:

# 用列表和嵌套字典对多列分组和聚合
# 对于每条航线,找到总航班数,取消的数量和比例,飞行时间的平均时间和方差
 In[12]: group_cols = ['ORG_AIR', 'DEST_AIR']
         agg_dict = {'CANCELLED':['sum', 'mean', 'size'], 
                     'AIR_TIME':['mean', 'var']}
         flights.groupby(group_cols).agg(agg_dict).head()
         # flights.groupby(['ORG_AIR', 'DEST_AIR']).agg({'CANCELLED': ['sum', 'mean', 'size'], 
         #                                               'AIR_TIME':['mean', 'var']}).head()
Out[12]:

3. 分组后去除多级索引

# 读取数据
 In[13]: flights = pd.read_csv('data/flights.csv')
         flights.head()
Out[13]:

# 按'AIRLINE', 'WEEKDAY'分组,分别对DIST和ARR_DELAY聚合
 In[14]: airline_info = flights.groupby(['AIRLINE', 'WEEKDAY'])\
                               .agg({'DIST':['sum', 'mean'], 
                                     'ARR_DELAY':['min', 'max']}).astype(int)
         airline_info.head()
Out[14]:

# 行和列都有两级索引,get_level_values(0)取出第一级索引
 In[15]: level0 = airline_info.columns.get_level_values(0)
         level0
Out[15]: Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')
# get_level_values(1)取出第二级索引
 In[16]: level1 = airline_info.columns.get_level_values(1)
         level1
Out[16]: Index(['sum', 'mean', 'min', 'max'], dtype='object')
# 一级和二级索引拼接成新的列索引
 In[17]: airline_info.columns = level0 + '_' + level1
 In[18]: airline_info.head(7)
Out[18]:

# reset_index()可以将行索引变成单级
 In[19]: airline_info.reset_index().head(7)
Out[19]:

更多

# Pandas默认会在分组运算后,将所有分组的列放在索引中,as_index设为False可以避免这么做。分组后使用reset_index,也可以达到同样的效果
 In[20]: flights.groupby(['AIRLINE'], as_index=False)['DIST'].agg('mean').round(0)
Out[20]:

# 上面这么做,会默认对AIRLINE排序,sort设为False可以避免排序
 In[21]: flights.groupby(['AIRLINE'], as_index=False, sort=False)['DIST'].agg('mean')
Out[21]:

4. 自定义聚合函数

 In[22]: college = pd.read_csv('data/college.csv')
         college.head()
Out[22]:

# 求出每个州的本科生的平均值和标准差
 In[23]: college.groupby('STABBR')['UGDS'].agg(['mean', 'std']).round(0).head()
Out[23]:

# 远离平均值的标准差的最大个数,写一个自定义函数
 In[24]: def max_deviation(s):
             std_score = (s - s.mean()) / s.std()
             return std_score.abs().max()
# agg聚合函数在调用方法时,直接引入自定义的函数名
 In[25]: college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head()
Out[25]: STABBR
         AK    2.6
         AL    5.8
         AR    6.3
         AS    NaN
         AZ    9.9
         Name: UGDS, dtype: float64

更多

# 自定义的聚合函数也适用于多个数值列
 In[26]: college.groupby('STABBR')['UGDS', 'SATVRMID', 'SATMTMID'].agg(max_deviation).round(1).head()
Out[26]:

# 自定义聚合函数也可以和预先定义的函数一起使用
 In[27]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID']\
                .agg([max_deviation, 'mean', 'std']).round(1).head()
Out[27]:

# Pandas使用函数名作为返回列的名字;你可以直接使用rename方法修改,或通过__name__属性修改
 In[28]: max_deviation.__name__
Out[28]: 'max_deviation'

 In[29]: max_deviation.__name__ = 'Max Deviation'
 In[30]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID']\
                .agg([max_deviation, 'mean', 'std']).round(1).head()
Out[30]:

5. 用 args 和 *kwargs 自定义聚合函数

# 用inspect模块查看groupby对象的agg方法的签名
 In[31]: college = pd.read_csv('data/college.csv')
         grouped = college.groupby(['STABBR', 'RELAFFIL'])
 In[32]: import inspect
         inspect.signature(grouped.agg)
Out[32]: <Signature (arg, *args, **kwargs)>

如何做

# 自定义一个返回去本科生人数在1000和3000之间的比例的函数
 In[33]: def pct_between_1_3k(s):
             return s.between(1000, 3000).mean()
# 用州和宗教分组,再聚合
 In[34]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between_1_3k).head(9)
Out[34]: 
STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
        1           0.111111
AS      0           1.000000
AZ      0           0.096774
        1           0.000000
Name: UGDS, dtype: float64
# 但是这个函数不能让用户自定义上下限,再新写一个函数
 In[35]: def pct_between(s, low, high):
             return s.between(low, high).mean()
# 使用这个自定义聚合函数,并传入最大和最小值
 In[36]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, 10000).head(9)
Out[36]: 
STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

原理

# 显示指定最大和最小值
 In[37]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, high=10000, low=1000).head(9)
Out[37]: 
STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64
# 也可以关键字参数和非关键字参数混合使用,只要非关键字参数在后面
 In[38]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, high=10000).head(9)
Out[38]: 
STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

更多

# Pandas不支持多重聚合时,使用参数
 In[39]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', pct_between], low=100, high=1000)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-39-3e3e18919cf9> in <module>()
----> 1 college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', pct_between], low=100, high=1000)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2871         if hasattr(func_or_funcs, '__iter__'):
   2872             ret = self._aggregate_multiple_funcs(func_or_funcs,
-> 2873                                                  (_level or 0) + 1)
   2874         else:
   2875             cyfunc = self._is_cython_func(func_or_funcs)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_multiple_funcs(self, arg, _level)
   2944                 obj._reset_cache()
   2945                 obj._selection = name
-> 2946             results[name] = obj.aggregate(func)
   2947 
   2948         if isinstance(list(compat.itervalues(results))[0],

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2878 
   2879             if self.grouper.nkeys > 1:
-> 2880                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
   2881 
   2882             try:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
    852 
    853         if len(output) == 0:
--> 854             return self._python_apply_general(f)
    855 
    856         if self.grouper._filter_empty_groups:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
    718     def _python_apply_general(self, f):
    719         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720                                                    self.axis)
    721 
    722         return self._wrap_applied_output(

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, f, data, axis)
   1800             # group might be modified
   1801             group_axes = _get_axes(group)
-> 1802             res = f(group)
   1803             if not _is_indexed_like(res, group_axes):
   1804                 mutated = True

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
    840     def _python_agg_general(self, func, *args, **kwargs):
    841         func = self._is_builtin_func(func)
--> 842         f = lambda x: func(x, *args, **kwargs)
    843 
    844         # iterate through "columns" ex exclusions to populate output dict

TypeError: pct_between() missing 2 required positional arguments: 'low' and 'high'
# 用闭包自定义聚合函数
 In[40]: def make_agg_func(func, name, *args, **kwargs):
             def wrapper(x):
                 return func(x, *args, **kwargs)
             wrapper.__name__ = name
             return wrapper

         my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
         my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)['UGDS'].agg(pct_between, 1000, high=10000).head(9)
Out[41]:

6. 检查分组对象

# 查看分组对象的类型
 In[42]: college = pd.read_csv('data/college.csv')
         grouped = college.groupby(['STABBR', 'RELAFFIL'])
         type(grouped)
Out[42]: pandas.core.groupby.DataFrameGroupBy
# 用dir函数找到该对象所有的可用函数
 In[43]: print([attr for attr in dir(grouped) if not attr.startswith('_')])
['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']
# 用ngroups属性查看分组的数量
 In[44]: grouped.ngroups
Out[44]: 112
# 查看每个分组的唯一识别标签,groups属性是一个字典,包含每个独立分组与行索引标签的对应
 In[45]: groups = list(grouped.groups.keys())
         groups[:6]
Out[45]: [('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]
# 用get_group,传入分组标签的元组。例如,获取佛罗里达州所有与宗教相关的学校
 In[46]: grouped.get_group(('FL', 1)).head()
Out[46]:

# groupby对象是一个可迭代对象,可以挨个查看每个独立分组
 In[47]: from IPython.display import display
 In[48]: i = 0
         for name, group in grouped:
             print(name)
             display(group.head(2))
             i += 1
             if i == 5:
                 break

# groupby对象使用head方法,可以在一个DataFrame钟显示每个分组的头几行
 In[49]: grouped.head(2).head(6)
Out[49]:

更多

# nth方法可以选出每个分组指定行的数据,下面选出的是第1行和最后1行
 In[50]: grouped.nth([1, -1]).head(8)
Out[50]:

7. 过滤状态

 In[51]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
         grouped = college.groupby('STABBR')
         grouped.ngroups
Out[51]: 59
# 这等于求出不同州的个数,nunique()可以得到同样的结果
 In[52]: college['STABBR'].nunique()
Out[52]: 59
# 自定义一个计算少数民族学生总比例的函数,如果比例大于阈值,还返回True
 In[53]: def check_minority(df, threshold):
             minority_pct = 1 - df['UGDS_WHITE']
             total_minority = (df['UGDS'] * minority_pct).sum()
             total_ugds = df['UGDS'].sum()
             total_minority_pct = total_minority / total_ugds
             return total_minority_pct > threshold
# grouped变量有一个filter方法,可以接收一个自定义函数,决定是否保留一个分组
 In[54]: college_filtered = grouped.filter(check_minority, threshold=.5)
         college_filtered.head()
Out[54]:

# 通过查看形状,可以看到过滤了60%,只有20个州的少数学生占据多数
 In[55]: college.shape
Out[55]: (7535, 26)

 In[56]: college_filtered.shape
Out[56]: (3028, 26)

 In[57]: college_filtered['STABBR'].nunique()
Out[57]: 20

更多

# 用一些不同的阈值,检查形状和不同州的个数
 In[58]: college_filtered_20 = grouped.filter(check_minority, threshold=.2)
        college_filtered_20.shape
Out[58]: (7461, 26)

 In[59]: college_filtered_20['STABBR'].nunique()
Out[59]: 57

 In[60]: college_filtered_70 = grouped.filter(check_minority, threshold=.7)
         college_filtered_70.shape
Out[60]: (957, 26)

 In[61]: college_filtered_70['STABBR'].nunique()
Out[61]: 10

 In[62]: college_filtered_95 = grouped.filter(check_minority, threshold=.95)
         college_filtered_95.shape
Out[62]: (156, 26)

8. 减肥对赌

# 读取减肥数据集,查看一月的数据
 In[63]: weight_loss = pd.read_csv('data/weight_loss.csv')
         weight_loss.query('Month == "Jan"')
Out[63]:

# 定义一个求减肥比例的函数
 In[64]: def find_perc_loss(s):
            return (s - s.iloc[0]) / s.iloc[0]
# 查看Bob在一月的减肥成果
 In[65]: bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
         find_perc_loss(bob_jan['Weight'])
Out[65]: 0    0.000000
         2   -0.010309
         4   -0.027491
         6   -0.027491
         Name: Weight, dtype: float64
# 对Name和Month进行分组,然后使用transform方法,传入函数,对数值进行转换
 In[66]: pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
         pcnt_loss.head(8)
Out[66]: 0    0.000000
         1    0.000000
         2   -0.010309
         3   -0.040609
         4   -0.027491
         5   -0.040609
         6   -0.027491
         7   -0.035533
         Name: Weight, dtype: float64
# transform之后的结果,行数不变,可以赋值给原始DataFrame作为一个新列;
# 为了缩短输出,只选择Bob的前两个月数据
 In[67]: weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
         weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')
Out[67]:

# 因为最重要的是每个月的第4周,只选择第4周的数据
 In[68]: week4 = weight_loss.query('Week == "Week 4"')
         week4
Out[68]:

# 用pivot重构DataFrame,让Amy和Bob的数据并排放置
 In[69]: winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
         winner
Out[69]:

# 用where方法选出每月的赢家
 In[70]: winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')
         winner.style.highlight_min(axis=1)
Out[70]:

# 用value_counts()返回最后的比分
 In[71]: winner.Winner.value_counts()
Out[71]: Amy    3
         Bob    1
         Name: Winner, dtype: int64

更多

# Pandas默认是按字母排序的
 In[72]: week4a = week4.copy()
         month_chron = week4a['Month'].unique() 
         month_chron
Out[72]: array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)
# 转换为Categorical变量,可以做成按时间排序
 In[73]: week4a['Month'] = pd.Categorical(week4a['Month'], 
                                          categories=month_chron,
                                          ordered=True)
         week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')
Out[73]:

9. 用apply计算每州的加权平均SAT分数

# 读取college,'UGDS', 'SATMTMID', 'SATVRMID'三列如果有缺失值则删除行
 In[74]: college = pd.read_csv('data/college.csv')
         subset = ['UGDS', 'SATMTMID', 'SATVRMID']
         college2 = college.dropna(subset=subset)
         college.shape
Out[74]: (7535, 27)

 In[75]: college2.shape
Out[75]: (1184, 27)
# 自定义一个求SAT数学成绩的加权平均值的函数
 In[76]: def weighted_math_average(df):
             weighted_math = df['UGDS'] * df['SATMTMID']
             return int(weighted_math.sum() / df['UGDS'].sum())
# 按州分组,并调用apply方法,传入自定义函数
 In[77]: college2.groupby('STABBR').apply(weighted_math_average).head()
Out[77]: STABBR
         AK    503
         AL    536
         AR    529
         AZ    569
         CA    564
         dtype: int64
# 效果同上
 In[78]: college2.groupby('STABBR').agg(weighted_math_average).head()
Out[78]:

# 如果将列限制到SATMTMID,会报错。这是因为不能访问UGDS。
 In[79]: college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  2177         try:
-> 2178             return self._aggregate_series_fast(obj, func)
  2179         except Exception:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
  2197                                     dummy)
-> 2198         result, counts = grouper.get_result()
  2199         return result, counts

pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()

pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38888)()

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
   841         func = self._is_builtin_func(func)
--> 842         f = lambda x: func(x, *args, **kwargs)
   843 

<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()

KeyError: 'UGDS'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  2882             try:
-> 2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2884             except Exception:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
   847             try:
--> 848                 result, counts = self.grouper.agg_series(obj, f)
   849                 output[name] = self._try_cast(result, obj, numeric_only=True)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  2179         except Exception:
-> 2180             return self._aggregate_series_pure_python(obj, func)
  2181 

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
  2210         for label, group in splitter:
-> 2211             res = func(group)
  2212             if result is None:

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
   841         func = self._is_builtin_func(func)
--> 842         f = lambda x: func(x, *args, **kwargs)
   843 

<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()

KeyError: 'UGDS'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-79-1351e4f306c7> in <module>()
----> 1 college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2884             except Exception:
-> 2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  2886 
  2887             index = Index(sorted(result), name=self.grouper.names[0])

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
  3013         for name, group in self:
  3014             group.name = name
-> 3015             output = func(group, *args, **kwargs)
  3016             if isinstance(output, (Series, Index, np.ndarray)):
  3017                 raise Exception('Must produce aggregated value')

<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   599         key = com._apply_if_callable(key, self)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 
   603             if not is_scalar(result):

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2475         try:
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:
  2479             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()

KeyError: 'UGDS'
# apply的一个不错的功能是通过返回Series,创建多个新的列
 In[80]: from collections import OrderedDict
         def weighted_average(df):
             data = OrderedDict()
             weight_m = df['UGDS'] * df['SATMTMID']
             weight_v = df['UGDS'] * df['SATVRMID']

             data['weighted_math_avg'] = weight_m.sum() / df['UGDS'].sum()
             data['weighted_verbal_avg'] = weight_v.sum() / df['UGDS'].sum()
             data['math_avg'] = df['SATMTMID'].mean()
             data['verbal_avg'] = df['SATVRMID'].mean()
             data['count'] = len(df)
             return pd.Series(data, dtype='int')

         college2.groupby('STABBR').apply(weighted_average).head(10)
Out[80]:

# 多创建两个新的列
 In[81]: from collections import OrderedDict
         def weighted_average(df):
             data = OrderedDict()
             weight_m = df['UGDS'] * df['SATMTMID']
             weight_v = df['UGDS'] * df['SATVRMID']

             wm_avg = weight_m.sum() / df['UGDS'].sum()
             wv_avg = weight_v.sum() / df['UGDS'].sum()

             data['weighted_math_avg'] = wm_avg
             data['weighted_verbal_avg'] = wv_avg
             data['math_avg'] = df['SATMTMID'].mean()
             data['verbal_avg'] = df['SATVRMID'].mean()
             data['count'] = len(df)
             return pd.Series(data, dtype='int')

         college2.groupby('STABBR').apply(weighted_average).head(10)
Out[81]:

更多

# 自定义一个返回DataFrame的函数,使用NumPy的函数average计算加权平均值,使用SciPy的gmean和hmean计算几何和调和平均值
 In[82]: from scipy.stats import gmean, hmean
         def calculate_means(df):
             df_means = pd.DataFrame(index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
             cols = ['SATMTMID', 'SATVRMID']
             for col in cols:
                 arithmetic = df[col].mean()
                 weighted = np.average(df[col], weights=df['UGDS'])
                 geometric = gmean(df[col])
                 harmonic = hmean(df[col])
                 df_means[col] = [arithmetic, weighted, geometric, harmonic]

             df_means['count'] = len(df)
             return df_means.astype(int)

         college2.groupby('STABBR').filter(lambda x: len(x) != 1).groupby('STABBR').apply(calculate_means).head(10)
Out[82]:

10. 用连续变量分组

 In[83]: flights = pd.read_csv('data/flights.csv')
         flights.head()
Out[83]:

# 判断DIST列有无缺失值
 In[84]: flights.DIST.hasnans
Out[84]: False
# 再次删除DIST列的缺失值(原书是没有这两段的)
 In[85]: flights.dropna(subset=['DIST']).shape
Out[85]: (58492, 14)
# 使用Pandas的cut函数,将数据分成5个面元
 In[86]: bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
         cuts = pd.cut(flights['DIST'], bins=bins)
         cuts.head()
Out[86]: 0     (500.0, 1000.0]
         1    (1000.0, 2000.0]
         2     (500.0, 1000.0]
         3    (1000.0, 2000.0]
         4    (1000.0, 2000.0]
         Name: DIST, dtype: category
         Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]
# 对每个面元进行统计
 In[87]: cuts.value_counts()
Out[87]: (500.0, 1000.0]     20659
         (200.0, 500.0]      15874
         (1000.0, 2000.0]    14186
         (2000.0, inf]        4054
         (-inf, 200.0]        3719
         Name: DIST, dtype: int64
# 面元Series可以用来进行分组
 In[88]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
Out[88]: DIST            AIRLINE
         (-inf, 200.0]   OO         0.326
                         EV         0.289
                         MQ         0.211
                         DL         0.086
                         AA         0.052
                         UA         0.027
                         WN         0.009
         (200.0, 500.0]  WN         0.194
                         DL         0.189
                         OO         0.159
                         EV         0.156
                         MQ         0.100
                         AA         0.071
                         UA         0.062
                         VX         0.028
         Name: AIRLINE, dtype: float64

原理

 In[89]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
Out[89]: 
DIST              AIRLINE
(-inf, 200.0]     OO         0.325625
                 EV         0.289325
                 MQ         0.210809
                 DL         0.086045
                 AA         0.052165
                 UA         0.027427
                 WN         0.008604
(200.0, 500.0]    WN         0.193902
                 DL         0.188736
                 OO         0.158687
                 EV         0.156293
                 MQ         0.100164
                 AA         0.071375
                 UA         0.062051
                 VX         0.028222
                 US         0.016001
                 NK         0.011843
                 B6         0.006867
                 F9         0.004914
                 AS         0.000945
(500.0, 1000.0]   DL         0.205625
                 AA         0.143908
                 WN         0.138196
                 UA         0.131129
                 OO         0.106443
                 EV         0.100683
                 MQ         0.051213
                 F9         0.038192
                 NK         0.029527
                 US         0.025316
                 AS         0.023234
                 VX         0.003582
                 B6         0.002953
(1000.0, 2000.0]  AA         0.263781
                 UA         0.199070
                 DL         0.165092
                 WN         0.159664
                 OO         0.046454
                 NK         0.045115
                 US         0.040462
                 F9         0.030664
                 AS         0.015931
                 EV         0.015579
                 VX         0.012125
                 B6         0.003313
                 MQ         0.002749
(2000.0, inf]     UA         0.289097
                 AA         0.211643
                 DL         0.171436
                 B6         0.080414
                 VX         0.073754
                 US         0.065121
                 WN         0.046374
                 HA         0.027627
                 NK         0.019240
                 AS         0.011593
                 F9         0.003700
Name: AIRLINE, dtype: float64

更多

# 求飞行时间的0.25,0.5,0.75分位数
 In[90]: flights.groupby(cuts)['AIR_TIME'].quantile(q=[.25, .5, .75]).div(60).round(2)
Out[90]: DIST                  
         (-inf, 200.0]     0.25    0.43
                           0.50    0.50
                           0.75    0.57
         (200.0, 500.0]    0.25    0.77
                           0.50    0.92
                           0.75    1.05
         (500.0, 1000.0]   0.25    1.43
                           0.50    1.65
                           0.75    1.92
         (1000.0, 2000.0]  0.25    2.50
                           0.50    2.93
                           0.75    3.40
         (2000.0, inf]     0.25    4.30
                           0.50    4.70
                           0.75    5.03
         Name: AIR_TIME, dtype: float64
# unstack方法可以将内层的索引变为列名
 In[91]: labels=['Under an Hour', '1 Hour', '1-2 Hours', '2-4 Hours', '4+ Hours']
         cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
         flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True).round(3).unstack().style.highlight_max(axis=1)
Out[91]:

11. 计算城市之间的航班总数

 In[92]: flights = pd.read_csv('data/flights.csv')
         flights.head()
Out[92]:

# 求每两个城市间的航班总数
 In[93]: flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
         flights_ct.head()
Out[93]: ORG_AIR  DEST_AIR
         ATL      ABE         31
                  ABQ         16
                  ABY         19
                  ACY          6
                  AEX         40
         dtype: int64
# 选出休斯顿(IAH)和亚特兰大(ATL)之间双方向的航班总数
 In[94]: flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]
Out[94]: ORG_AIR  DEST_AIR
         ATL      IAH         121
         IAH      ATL         148
         dtype: int64
# 分别对每行按照出发地和目的地,按字母排序
 In[95]: flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
         flights_sort.head()
Out[95]:

# 因为现在每行都是独立排序的,列名存在问题。对列重命名,然后再计算所有城市间的航班数
 In[96]: rename_dict = {'ORG_AIR':'AIR1','DEST_AIR':'AIR2'}
         flights_sort = flights_sort.rename(columns=rename_dict)
         flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
         flights_ct2.head()
Out[96]: AIR1  AIR2
         ABE   ATL     31
               ORD     24
         ABI   DFW     74
         ABQ   ATL     16
               DEN     46
         dtype: int64
# 找到亚特兰大和休斯顿之间的航班数
 In[97]: flights_ct2.loc[('ATL', 'IAH')]
Out[97]: 269
# 如果调换顺序,则会出错
 In[98]: flights_ct2.loc[('IAH', 'ATL')]
---------------------------------------------------------------------------
IndexingError                             Traceback (most recent call last)
<ipython-input-98-56147a7d0bb5> in <module>()
----> 1 flights_ct2.loc[('IAH', 'ATL')]

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
  1323             except (KeyError, IndexError):
  1324                 pass
-> 1325             return self._getitem_tuple(key)
  1326         else:
  1327             key = com._apply_if_callable(key, self.obj)

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   839 
   840         # no multi-index, so validate all of the indexers
--> 841         self._has_valid_tuple(tup)
   842 
   843         # ugly hack for GH #836

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
   186         for i, k in enumerate(key):
   187             if i >= self.obj.ndim:
--> 188                 raise IndexingError('Too many indexers')
   189             if not self._has_valid_type(k, i):
   190                 raise ValueError("Location based indexing can only have [%s] "

IndexingError: Too many indexers

更多

# 用NumPy的sort函数可以大大提高速度
 In[99]: data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
         data_sorted[:10]
Out[99]: array([['LAX', 'SLC'],
                ['DEN', 'IAD'],
                ['DFW', 'VPS'],
                ['DCA', 'DFW'],
                ['LAX', 'MCI'],
                ['IAH', 'SAN'],
                ['DFW', 'MSY'],
                ['PHX', 'SFO'],
                ['ORD', 'STL'],
                ['IAH', 'SJC']], dtype=object)
# 重新用DataFrame构造器创建一个DataFrame,检测其是否与flights_sorted相等
 In[100]: flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
          fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
          flights_sort2.equals(fs_orig)
Out[100]: True
# 比较速度
 In[101]: %timeit flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
          7.82 s ± 189 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

 In[102]: %%timeit
          data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
          flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
          10.9 ms ± 325 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

12. 找到持续最长的准时航班

# 创建一个Series
 In[103]: s = pd.Series([1, 1, 1, 0, 1, 1, 1, 0])
          s
Out[103]: 0    1
          1    1
          2    1
          3    0
          4    1
          5    1
          6    1
          7    0
          dtype: int64
# 累积求和
 In[104]: s1 = s.cumsum()
          s1
Out[104]: 0    1
          1    2
          2    3
          3    3
          4    4
          5    5
          6    6
          7    6
          dtype: int64
 In[105]: s.mul(s1).diff()
Out[105]: 0    NaN
          1    1.0
          2    1.0
          3   -3.0
          4    4.0
          5    1.0
          6    1.0
          7   -6.0
          dtype: float64
# 将所有非负值变为缺失值
 In[106]: s.mul(s1).diff().where(lambda x: x < 0)
Out[106]: 0    NaN
          1    NaN
          2    NaN
          3   -3.0
          4    NaN
          5    NaN
          6    NaN
          7   -6.0
          dtype: float64
 In[107]: s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0)
Out[107]: 0    1.0
          1    2.0
          2    3.0
          3    0.0
          4    1.0
          5    2.0
          6    3.0
          7    0.0
          dtype: float64
# 创建一个准时的列 ON_TIME
 In[108]: flights = pd.read_csv('data/flights.csv')
          flights['ON_TIME'] = flights['ARR_DELAY'].lt(15).astype(int)
          flights[['AIRLINE', 'ORG_AIR', 'ON_TIME']].head(10)
Out[108]:

# 将之前的逻辑做成一个函数
 In[109]: def max_streak(s):
              s1 = s.cumsum()
              return s.mul(s1).diff().where(lambda x: x < 0) \
                      .ffill().add(s1, fill_value=0).max()
 In[110]: flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']) \
                 .groupby(['AIRLINE', 'ORG_AIR'])['ON_TIME'] \
                 .agg(['mean', 'size', max_streak]).round(2).head()
Out[110]:

更多

# 求最长的延误航班
 In[111]: def max_delay_streak(df):
              df = df.reset_index(drop=True)
              s = 1 - df['ON_TIME']
              s1 = s.cumsum()
              streak = s.mul(s1).diff().where(lambda x: x < 0) \
                        .ffill().add(s1, fill_value=0)
              last_idx = streak.idxmax()
              first_idx = last_idx - streak.max() + 1
              df_return = df.loc[[first_idx, last_idx], ['MONTH', 'DAY']]
              df_return['streak'] = streak.max()
              df_return.index = ['first', 'last']
              df_return.index.name='streak_row'
              return df_return
 In[112]: flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']) \
                 .groupby(['AIRLINE', 'ORG_AIR']) \
                 .apply(max_delay_streak) \
                 .sort_values(['streak','MONTH','DAY'], ascending=[False, True, True]).head(10)
Out[112]: