#!pip3 install sidetable


import pandas as pd
import numpy as np
import sidetable


!wget https://www.dropbox.com/s/9e88whmc03nkouz/2019_Iowa_Liquor_Sales.csv


# Load the liquor sales CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv('2019_Iowa_Liquor_Sales.csv')


# Preview the first rows of the raw data.
df.head()


# sidetable (`.stb`) frequency table grouped by 'Store Name', using
# 'Sale (Dollars)' as the value column; styled output, no cumulative columns.
df.stb.freq(['Store Name'], value='Sale (Dollars)', style=True, cum_cols=False)


# First cleaning attempt: label every row whose store name contains
# 'Hy-Vee' (case-insensitive); all other rows keep a missing group.
is_hy_vee = df['Store Name'].str.contains('Hy-Vee', case=False)
df.loc[is_hy_vee, 'Store_Group_1'] = 'Hy-Vee'


%timeit df.loc[df['Store Name'].str.contains('Hy-Vee', case=False), 'Store_Group_1'] = 'Hy-Vee'

1.52 s ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


%timeit df.loc[df['Store Name'].str.contains('Hy-Vee', case=False, regex=False), 'Store_Group_1'] = 'Hy-Vee'

811 ms ± 5.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Row count per group; dropna=False also reports the rows that have no group yet.
df['Store_Group_1'].value_counts(dropna=False)

NaN       1617777
Hy-Vee     762568
Name: Store_Group_1, dtype: int64


# Rule table for the np.select approach: (search text, group label, is_regex).
# Every search is case-insensitive; only the Walmart rule needs a regex
# because it accepts two spellings.
_store_rules = [
    ('Hy-Vee', 'Hy-Vee', False),
    ('Central City', 'Central City', False),
    ("Smokin' Joe's", "Smokin' Joe's", False),
    ('Walmart|Wal-Mart', 'Wal-Mart', True),
    ('Fareway Stores', 'Fareway Stores', False),
    ("Casey's", "Casey's General Store", False),
    ("Sam's Club", "Sam's Club", False),
    ('Kum & Go', 'Kum & Go', False),
    ('CVS', 'CVS Pharmacy', False),
    ('Walgreens', 'Walgreens', False),
    ('Yesway', 'Yesway Store', False),
    ('Target Store', 'Target', False),
    ('Quik Trip', 'Quik Trip', False),
    ('Circle K', 'Circle K', False),
    ('Hometown Foods', 'Hometown Foods', False),
    ("Bucky's", "Bucky's Express", False),
    ('Kwik', 'Kwik Shop', False),
]

# Pair each boolean mask over 'Store Name' with the label it selects.
store_patterns = [
    (df['Store Name'].str.contains(text, case=False, regex=is_regex), label)
    for text, label, is_regex in _store_rules
]


# Split the pairs into the parallel sequences np.select expects; rows that
# match no mask get the literal 'other'.
store_criteria = tuple(mask for mask, _ in store_patterns)
store_values = tuple(label for _, label in store_patterns)
df['Store_Group_1'] = np.select(store_criteria, store_values, 'other')


df.stb.freq(['Store_Group_1'], value='Sale (Dollars)', style=True, cum_cols=False)


# Same selection as before, but with None as the default so that rows
# matching no criterion are left empty instead of being labelled.
df['Store_Group_1'] = np.select(store_criteria, store_values, None)
# Fill the empty rows with the original store name.
df['Store_Group_1'] = df['Store_Group_1'].combine_first(df['Store Name'])


# Sales frequency table per group after the fallback.
df.stb.freq(['Store_Group_1'], value='Sale (Dollars)', style=True, cum_cols=False)


def generalize(ser, match_name, default=None, regex=False, case=False):
    """Normalize the text values of a Series by pattern matching.

    Based on code from https://www.metasnake.com/blog/pydata-assign.html

    Patterns are applied in order. Each pattern is tested against the
    series as rewritten by the previous patterns, and every matching value
    is replaced by the associated name.

    ser: pandas Series of text to search
    match_name: iterable of (text to search for, normalized name) pairs
    default: value for entries that matched no pattern; when None the
        original text is kept
    regex: whether the search texts are regular expressions
    case: whether the search is case-sensitive

    Returns a pandas Series with the matched values replaced.
    """
    # Start from an all-False mask so an empty match_name is handled too.
    seen = pd.Series(False, index=ser.index)
    for match, name in match_name:
        # na=False: missing values never match, keeping the mask strictly
        # boolean so that `~mask` is always valid.
        mask = ser.str.contains(match, case=case, regex=regex, na=False)
        seen = seen | mask
        ser = ser.where(~mask, name)
    # Compare against None so falsy defaults such as '' are honoured.
    if default is not None:
        ser = ser.where(seen, default)
    return ser


# Search text -> normalized store group. Insertion order is the order in
# which the patterns are applied.
_store_group_by_text = {
    'Hy-Vee': 'Hy-Vee',
    "Smokin' Joe's": "Smokin' Joe's",
    'Central City': 'Central City',
    'Costco Wholesale': 'Costco Wholesale',
    'Walmart': 'Walmart',
    'Wal-Mart': 'Walmart',
    'Fareway Stores': 'Fareway Stores',
    "Casey's": "Casey's General Store",
    "Sam's Club": "Sam's Club",
    'Kum & Go': 'Kum & Go',
    'CVS': 'CVS Pharmacy',
    'Walgreens': 'Walgreens',
    'Yesway': 'Yesway Store',
    'Target Store': 'Target',
    'Quik Trip': 'Quik Trip',
    'Circle K': 'Circle K',
    'Hometown Foods': 'Hometown Foods',
    "Bucky's": "Bucky's Express",
    'Kwik': 'Kwik Shop',
}

# List of (search text, normalized name) tuples.
store_patterns_2 = list(_store_group_by_text.items())


# Alternative approach: normalize the raw store names with generalize().
df['Store_Group_2'] = generalize(df['Store Name'], store_patterns_2)


# Data-type experiment: store the names as a categorical column.
df['Store Name'] = df['Store Name'].astype('category')


# NOTE(review): store_criteria holds the masks built when store_patterns was
# defined; if the categorical column is what should be measured here, rebuild
# those masks after the dtype change. TODO confirm.
df['Store_Group_3'] = np.select(store_criteria, store_values, None)
# Fill the unmatched (None) rows of Store_Group_3 itself with the store name,
# so the np.select result above is actually used.
df['Store_Group_3'] = df['Store_Group_3'].combine_first(df['Store Name'])


# Convert the names to the pandas 'string' dtype for the following steps.
df['Store Name'] = df['Store Name'].astype('string')


# Preview the frame.
df.head()


# Lookup table with one row per distinct store name, so the text matching
# runs over the unique names only.
lookup_df = pd.DataFrame({'Store Name': df['Store Name'].unique()})
lookup_df['Store_Group_5'] = generalize(lookup_df['Store Name'], store_patterns_2)


lookup_df.head()


# Left-join the lookup onto the full data using the columns both frames share.
df = df.merge(lookup_df, how='left')

	Invoice/Item Number	Date	Store Number	Store Name	Address	City	Zip Code	Store Location	County Number	County	...	Item Number	Item Description	Pack	Bottle Volume (ml)	State Bottle Cost	State Bottle Retail	Bottles Sold	Sale (Dollars)	Volume Sold (Liters)	Volume Sold (Gallons)
0	INV-16681900011	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	48099	Hennessy VS	24	200	6.24	9.36	24	224.64	4.8	1.26
1	INV-16681900027	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	89191	Jose Cuervo Especial Reposado Mini	12	500	11.50	17.25	12	207.00	6.0	1.58
2	INV-16681900018	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	8824	Lauder's	24	375	3.21	4.82	24	115.68	9.0	2.37
3	INV-16685400036	01/02/2019	2524	Hy-Vee Food Store / Dubuque	3500 Dodge St	Dubuque	52001.0	NaN	31.0	DUBUQUE	...	35917	Five O'Clock Vodka	12	1000	4.17	6.26	12	75.12	12.0	3.17
4	INV-16690300035	01/02/2019	4449	Kum & Go #121 / Urbandale	12041 Douglas Pkwy	Urbandale	50322.0	NaN	77.0	POLK	...	36304	Hawkeye Vodka	24	375	1.86	2.79	24	66.96	9.0	2.37

	Store_Group_1	Sale (Dollars)	percent
0	Hy-Vee	126,265,195	36.16%
1	other	112,733,367	32.28%
2	Fareway Stores	23,146,939	6.63%
3	Wal-Mart	22,641,682	6.48%
4	Sam's Club	19,604,085	5.61%
5	Central City	14,108,944	4.04%
6	Casey's General Store	11,351,935	3.25%
7	Kum & Go	6,019,449	1.72%
8	Walgreens	2,942,270	0.84%
9	Target	2,904,611	0.83%
10	Smokin' Joe's	2,049,536	0.59%
11	Kwik Shop	1,431,142	0.41%
12	Quik Trip	1,140,374	0.33%
13	CVS Pharmacy	795,303	0.23%
14	Hometown Foods	787,840	0.23%
15	Yesway Store	741,863	0.21%
16	Bucky's Express	465,757	0.13%
17	Circle K	90,049	0.03%

	Invoice/Item Number	Date	Store Number	Store Name	Address	City	Zip Code	Store Location	County Number	County	...	Bottle Volume (ml)	State Bottle Cost	State Bottle Retail	Bottles Sold	Sale (Dollars)	Volume Sold (Liters)	Volume Sold (Gallons)	Store_Group_1	Store_Group_2	Store_Group_3
0	INV-16681900011	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	200	6.24	9.36	24	224.64	4.8	1.26	Sauce	Sauce	Sauce
1	INV-16681900027	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	500	11.50	17.25	12	207.00	6.0	1.58	Sauce	Sauce	Sauce
2	INV-16681900018	01/02/2019	5286	Sauce	108, College	Iowa City	52240.0	NaN	52.0	JOHNSON	...	375	3.21	4.82	24	115.68	9.0	2.37	Sauce	Sauce	Sauce
3	INV-16685400036	01/02/2019	2524	Hy-Vee Food Store / Dubuque	3500 Dodge St	Dubuque	52001.0	NaN	31.0	DUBUQUE	...	1000	4.17	6.26	12	75.12	12.0	3.17	Hy-Vee	Hy-Vee	Hy-Vee
4	INV-16690300035	01/02/2019	4449	Kum & Go #121 / Urbandale	12041 Douglas Pkwy	Urbandale	50322.0	NaN	77.0	POLK	...	375	1.86	2.79	24	66.96	9.0	2.37	Kum & Go	Kum & Go	Kum & Go

	Store Name	Store_Group_5
0	Sauce	Sauce
1	Hy-Vee Food Store / Dubuque	Hy-Vee
2	Kum & Go #121 / Urbandale	Kum & Go
3	IDA Liquor	IDA Liquor
4	Lake View Foods	Lake View Foods

Решение	Время исполнения	Примечания
`np.select`	`13 с`	Может работать для нетекстового анализа
`generalize`	`15 с`	Только текст
Категориальные данные и `np.select`	`786 мс`	Категориальные данные могут быть сложными при merging и joining
Таблица поиска и `generalize`	`1.3 с`	Таблица поиска может поддерживаться кем-то другим

Эффективная очистка текста с помощью Pandas

Вступление

Проблема

Данные

Попытка очистки №1

Попытка очистки №2

Попытка очистки №3

А как насчет типов данных?

Таблица поиска

Резюме