# импортируем необходимые модули:
import pandas as pd
import numpy as np
import seaborn as sns

# добавляем в графики красивости seaborn:
sns.set_style('whitegrid')


raw_df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=true')
raw_df.head()


df = raw_df.groupby(['account number', 'name'])['ext price'].sum().reset_index()
df.head()


df['ext price'].plot(kind='hist');


df['ext price'].describe()

count        20.000000
mean     101711.287500
std       27037.449673
min       55733.050000
25%       89137.707500
50%      100271.535000
75%      110132.552500
max      184793.700000
Name: ext price, dtype: float64


pd.qcut(df['ext price'], q=4)

0     (55733.049000000006, 89137.708]
1             (89137.708, 100271.535]
2     (55733.049000000006, 89137.708]
3              (110132.552, 184793.7]
4             (89137.708, 100271.535]
5             (89137.708, 100271.535]
6     (55733.049000000006, 89137.708]
7            (100271.535, 110132.552]
8              (110132.552, 184793.7]
9              (110132.552, 184793.7]
10            (89137.708, 100271.535]
11    (55733.049000000006, 89137.708]
12    (55733.049000000006, 89137.708]
13            (89137.708, 100271.535]
14           (100271.535, 110132.552]
15             (110132.552, 184793.7]
16           (100271.535, 110132.552]
17             (110132.552, 184793.7]
18           (100271.535, 110132.552]
19           (100271.535, 110132.552]
Name: ext price, dtype: category
Categories (4, interval[float64]): [(55733.049000000006, 89137.708] < (89137.708, 100271.535] < (100271.535, 110132.552] < (110132.552, 184793.7]]


df['quantile_ex_1'] = pd.qcut(df['ext price'], q=4)
df['quantile_ex_2'] = pd.qcut(df['ext price'], q=10, precision=0)

df.head()


df['quantile_ex_1'].value_counts()

(55733.049000000006, 89137.708]    5
(89137.708, 100271.535]            5
(100271.535, 110132.552]           5
(110132.552, 184793.7]             5
Name: quantile_ex_1, dtype: int64


df['quantile_ex_2'].value_counts()

(55732.0, 76471.0]      2
(76471.0, 87168.0]      2
(87168.0, 90686.0]      2
(90686.0, 95908.0]      2
(95908.0, 100272.0]     2
(100272.0, 103606.0]    2
(103606.0, 105938.0]    2
(105938.0, 112290.0]    2
(112290.0, 124778.0]    2
(124778.0, 184794.0]    2
Name: quantile_ex_2, dtype: int64


bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']

df['quantile_ex_3'] = pd.qcut(df['ext price'],
                              q=[0, .2, .4, .6, .8, 1],
                              labels=bin_labels_5)
df.head()


df['quantile_ex_3'].value_counts()

Bronze      4
Silver      4
Gold        4
Platinum    4
Diamond     4
Name: quantile_ex_3, dtype: int64


# возвращается кортеж:
results, bin_edges = pd.qcut(df['ext price'],
                            q=[0, .2, .4, .6, .8, 1],
                            labels=bin_labels_5,
                            retbins=True)


# категориальная переменная:
results

0       Bronze
1         Gold
2       Bronze
3      Diamond
4       Silver
5       Silver
6       Bronze
7     Platinum
8      Diamond
9      Diamond
10        Gold
11      Bronze
12      Silver
13      Silver
14        Gold
15     Diamond
16    Platinum
17    Platinum
18    Platinum
19        Gold
Name: ext price, dtype: category
Categories (5, object): ['Bronze' < 'Silver' < 'Gold' < 'Platinum' < 'Diamond']


bin_edges

array([ 55733.05 ,  87167.958,  95908.156, 103605.97 , 112290.054,
       184793.7  ])


results_table = pd.DataFrame(zip(bin_edges, bin_labels_5),
                            columns=['Threshold', 'Tier'])
results_table


df.head()


df.describe(include='category')


df.describe(percentiles=[0, 1/3, 2/3, 1])


df['quantile_ex_4'] = pd.qcut(df['ext price'],
                              q=[0, .2, .4, .6, .8, 1],
                              labels=False,
                              precision=0)
df.head()


df = df.drop(columns = ['quantile_ex_1','quantile_ex_2',
                        'quantile_ex_3', 'quantile_ex_4'])
df.head()


pd.cut(df['ext price'], bins=4)

0       (55603.989, 87998.212]
1      (87998.212, 120263.375]
2       (55603.989, 87998.212]
3     (120263.375, 152528.538]
4      (87998.212, 120263.375]
5      (87998.212, 120263.375]
6       (55603.989, 87998.212]
7      (87998.212, 120263.375]
8      (87998.212, 120263.375]
9       (152528.538, 184793.7]
10     (87998.212, 120263.375]
11      (55603.989, 87998.212]
12      (55603.989, 87998.212]
13     (87998.212, 120263.375]
14     (87998.212, 120263.375]
15    (120263.375, 152528.538]
16     (87998.212, 120263.375]
17     (87998.212, 120263.375]
18     (87998.212, 120263.375]
19     (87998.212, 120263.375]
Name: ext price, dtype: category
Categories (4, interval[float64]): [(55603.989, 87998.212] < (87998.212, 120263.375] < (120263.375, 152528.538] < (152528.538, 184793.7]]


pd.cut(df['ext price'], bins=4).value_counts()

(87998.212, 120263.375]     12
(55603.989, 87998.212]       5
(120263.375, 152528.538]     2
(152528.538, 184793.7]       1
Name: ext price, dtype: int64


cut_labels_4 = ['silver', 'gold', 'platinum', 'diamond']
cut_bins = [0, 70000, 100000, 130000, 200000]

df['cut_ex1'] = pd.cut(df['ext price'],
                       bins=cut_bins,
                       labels=cut_labels_4)
df.head()


pd.cut(df['ext price'],
       bins=np.linspace(0, 200000, 9))

0       (50000.0, 75000.0]
1      (75000.0, 100000.0]
2      (75000.0, 100000.0]
3     (125000.0, 150000.0]
4      (75000.0, 100000.0]
5      (75000.0, 100000.0]
6      (75000.0, 100000.0]
7     (100000.0, 125000.0]
8     (100000.0, 125000.0]
9     (175000.0, 200000.0]
10     (75000.0, 100000.0]
11      (50000.0, 75000.0]
12     (75000.0, 100000.0]
13     (75000.0, 100000.0]
14    (100000.0, 125000.0]
15    (100000.0, 125000.0]
16    (100000.0, 125000.0]
17    (100000.0, 125000.0]
18    (100000.0, 125000.0]
19    (100000.0, 125000.0]
Name: ext price, dtype: category
Categories (8, interval[float64]): [(0.0, 25000.0] < (25000.0, 50000.0] < (50000.0, 75000.0] < (75000.0, 100000.0] < (100000.0, 125000.0] < (125000.0, 150000.0] < (150000.0, 175000.0] < (175000.0, 200000.0]]


pd.interval_range(start=0,
                  freq=10000,
                  end=200000,
                  closed='left')

IntervalIndex([[0, 10000), [10000, 20000), [20000, 30000), [30000, 40000), [40000, 50000) ... [150000, 160000), [160000, 170000), [170000, 180000), [180000, 190000), [190000, 200000)],
              closed='left',
              dtype='interval[int64]')


interval_range = pd.interval_range(start=0,
                                   freq=10000,
                                   end=200000)

df['cut_ex2'] = pd.cut(df['ext price'],
                       bins=interval_range,
                       labels=[1,2,3])
df.head()


df['ext price'].value_counts(bins=4,
                             sort=False)

(55603.988000000005, 87998.212]     5
(87998.212, 120263.375]            12
(120263.375, 152528.538]            2
(152528.538, 184793.7]              1
Name: ext price, dtype: int64

	account number	name	sku	quantity	unit price	ext price	date
0	740150	Barton LLC	B1-20000	39	86.69	3380.91	2018-01-01 07:21:51.000002
1	714466	Trantow-Barrows	S2-77896	-1	63.16	-63.16	2018-01-01 10:00:47.000004
2	218895	Kulas Inc	B1-69924	23	90.70	2086.10	2018-01-01 13:24:57.999997
3	307599	Kassulke, Ondricka and Metz	S1-65481	41	21.05	863.05	2018-01-01 15:05:21.999995
4	412290	Jerde-Hilpert	S2-34077	6	83.21	499.26	2018-01-01 23:26:55.000003

	account number	name	ext price
0	141962	Herman LLC	63626.03
1	146832	Kiehn-Spinka	99608.77
2	163416	Purdy-Kunde	77898.21
3	218895	Kulas Inc	137351.96
4	239344	Stokes LLC	91535.92

	account number	name	ext price
0	141962	Herman LLC	63626.03
1	146832	Kiehn-Spinka	99608.77
2	163416	Purdy-Kunde	77898.21
3	218895	Kulas Inc	137351.96
4	239344	Stokes LLC	91535.92

	account number	name	ext price	cut_ex1
0	141962	Herman LLC	63626.03	silver
1	146832	Kiehn-Spinka	99608.77	gold
2	163416	Purdy-Kunde	77898.21	gold
3	218895	Kulas Inc	137351.96	diamond
4	239344	Stokes LLC	91535.92	gold

	account number	name	ext price	cut_ex1	cut_ex2
0	141962	Herman LLC	63626.03	silver	(60000, 70000]
1	146832	Kiehn-Spinka	99608.77	gold	(90000, 100000]
2	163416	Purdy-Kunde	77898.21	gold	(70000, 80000]
3	218895	Kulas Inc	137351.96	diamond	(130000, 140000]
4	239344	Stokes LLC	91535.92	gold	(90000, 100000]

Разделение (биннинг, дискретизация, балансировка) данных с помощью qcut и cut в Pandas¶

Введение¶

Биннинг (binning)¶

qcut¶

cut¶

Заключение¶

	Threshold	Tier
0	55733.050	Bronze
1	87167.958	Silver
2	95908.156	Gold
3	103605.970	Platinum
4	112290.054	Diamond

	quantile_ex_1	quantile_ex_2	quantile_ex_3
count	20	20	20
unique	4	10	5
top	(55733.049000000006, 89137.708]	(55732.0, 76471.0]	Bronze
freq	5	2	4

	account number	ext price
count	20.000000	20.000000
mean	476998.750000	101711.287500
std	231499.208970	27037.449673
min	141962.000000	55733.050000
0%	141962.000000	55733.050000
33.3%	332759.333333	91241.493333
50%	476006.500000	100271.535000
66.7%	662511.000000	104178.580000
100%	786968.000000	184793.700000
max	786968.000000	184793.700000