import pandas as pd


# Download the sample sales workbook straight from GitHub and load it into
# a DataFrame; the "?raw=True" query makes GitHub serve the file itself
# instead of the HTML page that wraps it.
df = pd.read_excel(
    "https://github.com/chris1610/pbpython/blob/master/data/sample-salesv3.xlsx?raw=True"
)

# Preview the first rows to check that the load worked.
df.head()


# Print a structural summary of the frame: index range, per-column
# non-null counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   account number  1500 non-null   int64
 1   name            1500 non-null   object
 2   sku             1500 non-null   object
 3   quantity        1500 non-null   int64
 4   unit price      1500 non-null   float64
 5   ext price       1500 non-null   float64
 6   date            1500 non-null   object
dtypes: float64(2), int64(2), object(3)
memory usage: 82.2+ KB


# The "date" column is loaded with dtype object; parse it into real
# timestamps so that time-based resampling and grouping can be used on it.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates


# Inspect the column dtypes; "date" is expected to show datetime64[ns]
# here rather than object once it has been converted.
df.dtypes

account number             int64
name                      object
sku                       object
quantity                   int64
unit price               float64
ext price                float64
date              datetime64[ns]
dtype: object


# Total "ext price" per calendar month: resample() needs a datetime index,
# so "date" is promoted to the index first.
# NOTE(review): recent pandas releases (2.2+) appear to deprecate the 'M'
# alias in favour of 'ME' -- confirm against the pandas version in use
# before changing it.
(
    df.set_index('date')
    .resample('M')["ext price"]
    .sum()
)

date
2014-01-31    185361.66
2014-02-28    146211.62
2014-03-31    203921.38
2014-04-30    174574.11
2014-05-31    165418.55
2014-06-30    174089.33
2014-07-31    191662.11
2014-08-31    153778.59
2014-09-30    168443.17
2014-10-31    171495.32
2014-11-30    119961.22
2014-12-31    163867.26
Freq: M, Name: ext price, dtype: float64


# Monthly "ext price" totals per customer: group by "name" first, then
# resample each group's datetime index by month and sum.
(
    df.set_index('date')
    .groupby('name')["ext price"]
    .resample("M")
    .sum()
)

name        date
Barton LLC  2014-01-31     6177.57
            2014-02-28    12218.03
            2014-03-31     3513.53
            2014-04-30    11474.20
            2014-05-31    10220.17
                            ...
Will LLC    2014-08-31     1439.82
            2014-09-30     4345.99
            2014-10-31     7085.33
            2014-11-30     3210.44
            2014-12-31    12561.21
Name: ext price, Length: 240, dtype: float64


# Same per-customer monthly totals, expressed as a single groupby:
# pd.Grouper buckets the "date" column by month, so no set_index() or
# resample() step is needed.
df.groupby(
    ['name', pd.Grouper(key='date', freq='M')]
)['ext price'].sum()

name        date
Barton LLC  2014-01-31     6177.57
            2014-02-28    12218.03
            2014-03-31     3513.53
            2014-04-30    11474.20
            2014-05-31    10220.17
                            ...
Will LLC    2014-08-31     1439.82
            2014-09-30     4345.99
            2014-10-31     7085.33
            2014-11-30     3210.44
            2014-12-31    12561.21
Name: ext price, Length: 240, dtype: float64


# Per-customer totals bucketed by year ending in December; only the
# Grouper frequency changes compared with the monthly version.
# NOTE(review): recent pandas releases (2.2+) appear to deprecate the
# 'A-DEC' alias in favour of 'YE-DEC' -- confirm against the pandas
# version in use before changing it.
df.groupby(
    ['name', pd.Grouper(key='date', freq='A-DEC')]
)['ext price'].sum()

name                             date
Barton LLC                       2014-12-31    109438.50
Cronin, Oberbrunner and Spencer  2014-12-31     89734.55
Frami, Hills and Schmidt         2014-12-31    103569.59
Fritsch, Russel and Anderson     2014-12-31    112214.71
Halvorson, Crona and Champlin    2014-12-31     70004.36
Herman LLC                       2014-12-31     82865.00
Jerde-Hilpert                    2014-12-31    112591.43
Kassulke, Ondricka and Metz      2014-12-31     86451.07
Keeling LLC                      2014-12-31    100934.30
Kiehn-Spinka                     2014-12-31     99608.77
Koepp Ltd                        2014-12-31    103660.54
Kuhn-Gusikowski                  2014-12-31     91094.28
Kulas Inc                        2014-12-31    137351.96
Pollich LLC                      2014-12-31     87347.18
Purdy-Kunde                      2014-12-31     77898.21
Sanford and Sons                 2014-12-31     98822.98
Stokes LLC                       2014-12-31     91535.92
Trantow-Barrows                  2014-12-31    123381.38
White-Trantow                    2014-12-31    135841.99
Will LLC                         2014-12-31    104437.60
Name: ext price, dtype: float64


# Column-wise totals for the two numeric columns of interest.
(
    df[["ext price", "quantity"]]
    .sum()
)

ext price    2018784.32
quantity       36463.00
dtype: float64


# Average unit price across all rows.
df["unit price"].mean()

55.00752666666659


# Apply both aggregations to every selected column in one call; the
# result has one row per function and one column per input column.
(
    df[["ext price", "quantity", "unit price"]]
    .agg(['sum', 'mean'])
)


# A dict picks the aggregations per column: "unit price" only gets a mean,
# so its "sum" cell comes back as NaN in the result.
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
})


# Custom aggregation: return the most frequent value of a Series.
# value_counts() sorts by count in descending order, so the first index
# label is the most common value; dropna=False lets NaN be counted too.
# Deliberately kept as a lambda so that agg() labels its row "<lambda>".
get_max = lambda x: (
    x.value_counts(dropna=False)
    .index[0]
)


# Mix built-in aggregations with the custom callable: "sku" is reduced by
# get_max, and that row of the result is labelled with the callable's
# __name__.
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
    'sku': [get_max],
})


# agg() labels the result row with the callable's __name__, so give the
# lambda a readable name to replace the default "<lambda>" label.
get_max.__name__ = "most frequent"


# Re-run the mixed aggregation; the row produced by get_max is labelled
# with whatever get_max.__name__ holds at call time.
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
    'sku': [get_max],
})


import collections
# Build the aggregation spec as an OrderedDict, inserting the columns in
# the order: ext price, quantity, sku.
# NOTE(review): presumably done to control the column order of the
# result -- confirm against the accompanying article.
f = collections.OrderedDict()
f['ext price'] = ['sum', 'mean']
f['quantity'] = ['sum', 'mean']
f['sku'] = [get_max]

df.agg(f)

	account number	name	sku	quantity	unit price	ext price	date
0	740150	Barton LLC	B1-20000	39	86.69	3380.91	2014-01-01 07:21:51
1	714466	Trantow-Barrows	S2-77896	-1	63.16	-63.16	2014-01-01 10:00:47
2	218895	Kulas Inc	B1-69924	23	90.70	2086.10	2014-01-01 13:24:58
3	307599	Kassulke, Ondricka and Metz	S1-65481	41	21.05	863.05	2014-01-01 15:05:22
4	412290	Jerde-Hilpert	S2-34077	6	83.21	499.26	2014-01-01 23:26:55

	ext price	quantity	unit price
sum	2.018784e+06	36463.000000	82511.290000
mean	1.345856e+03	24.308667	55.007527

	ext price	quantity	unit price
mean	1.345856e+03	24.308667	55.007527
sum	2.018784e+06	36463.000000	NaN

	ext price	quantity	unit price	sku
<lambda>	NaN	NaN	NaN	S2-77896
mean	1.345856e+03	24.308667	55.007527	NaN
sum	2.018784e+06	36463.000000	NaN	NaN

	ext price	quantity	unit price	sku
mean	1.345856e+03	24.308667	55.007527	NaN
most frequent	NaN	NaN	NaN	S2-77896
sum	2.018784e+06	36463.000000	NaN	NaN

Объяснение функций Grouper и Agg в Pandas

Введение

Группировка данных временных рядов

Новая и улучшенная агрегатная функция

Заключение