import pandas as pd

# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#categoricaldtype
from pandas.api.types import CategoricalDtype


# Notebook shell escape: fetch the source CSV into the current working directory.
!wget https://www.dropbox.com/s/jou3p1zdyvjmq4e/OP_DTL_RSRCH_PGYR2017_P06302020.csv


# Load the downloaded file. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
df_raw = pd.read_csv('OP_DTL_RSRCH_PGYR2017_P06302020.csv', low_memory=False)
# Preview the first rows.
df_raw.head()


# Keep only columns that are at least 90% populated: `thresh` is the minimum
# number of non-null values a column needs in order to be kept.
drop_thresh = df_raw.shape[0] * .9
# `how` and `thresh` are mutually exclusive in dropna: pandas >= 2.0 raises
# TypeError when both are passed, and older versions silently ignored `how`.
# Passing only `thresh` keeps the filtering that was actually applied.
# .copy() gives df its own data so later column assignments leave df_raw alone.
df = df_raw.dropna(thresh=drop_thresh, axis='columns').copy()


# Baseline column count and memory footprint of the unfiltered frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673227 entries, 0 to 673226
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(34), int64(3), object(139)
memory usage: 904.0+ MB


# Same summary for the trimmed frame, for comparison with df_raw.info() above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673227 entries, 0 to 673226
Data columns (total 34 columns):
 #   Column                                                            Non-Null Count   Dtype
---  ------                                                            --------------   -----
 0   Change_Type                                                       673227 non-null  object
 1   Covered_Recipient_Type                                            673227 non-null  object
 2   Recipient_Primary_Business_Street_Address_Line1                   672568 non-null  object
 3   Recipient_City                                                    672568 non-null  object
 4   Recipient_State                                                   672008 non-null  object
 5   Recipient_Zip_Code                                                672008 non-null  object
 6   Recipient_Country                                                 672568 non-null  object
 7   Principal_Investigator_1_Profile_ID                               636770 non-null  float64
 8   Principal_Investigator_1_First_Name                               636770 non-null  object
 9   Principal_Investigator_1_Last_Name                                636770 non-null  object
 10  Principal_Investigator_1_Business_Street_Address_Line1            636770 non-null  object
 11  Principal_Investigator_1_City                                     636770 non-null  object
 12  Principal_Investigator_1_State                                    636749 non-null  object
 13  Principal_Investigator_1_Zip_Code                                 636749 non-null  object
 14  Principal_Investigator_1_Country                                  636770 non-null  object
 15  Principal_Investigator_1_Primary_Type                             636770 non-null  object
 16  Principal_Investigator_1_Specialty                                635907 non-null  object
 17  Principal_Investigator_1_License_State_code1                      636770 non-null  object
 18  Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name         673227 non-null  object
 19  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID       673227 non-null  int64
 20  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name     673227 non-null  object
 21  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State    608591 non-null  object
 22  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country  673227 non-null  object
 23  Related_Product_Indicator                                         673227 non-null  object
 24  Total_Amount_of_Payment_USDollars                                 673227 non-null  float64
 25  Date_of_Payment                                                   673227 non-null  object
 26  Form_of_Payment_or_Transfer_of_Value                              673227 non-null  object
 27  Preclinical_Research_Indicator                                    673227 non-null  object
 28  Delay_in_Publication_Indicator                                    673227 non-null  object
 29  Name_of_Study                                                     666425 non-null  object
 30  Dispute_Status_for_Publication                                    673227 non-null  object
 31  Record_ID                                                         673227 non-null  int64
 32  Program_Year                                                      673227 non-null  int64
 33  Payment_Publication_Date                                          673227 non-null  object
dtypes: float64(2), int64(3), object(29)
memory usage: 174.6+ MB


# from_records: builds a DataFrame from a structured array; here the records
# are (column name, number of distinct non-null values) tuples.

unique_counts = pd.DataFrame.from_records(
    [(name, df[name].nunique()) for name in df.columns],
    columns=['Column_Name', 'Num_Unique'],
)
# Lowest-cardinality columns come first after sorting.
unique_counts = unique_counts.sort_values(by=['Num_Unique'])


unique_counts


cols_to_exclude = ['Program_Year', 'Date_of_Payment', 'Payment_Publication_Date']

# Convert every column with fewer than 700 distinct values to the category
# dtype, skipping the explicitly excluded year/date columns.
for col in df.columns:
    if col in cols_to_exclude:
        continue
    if df[col].nunique() >= 700:
        continue
    df[col] = df[col].astype('category')


# Re-check dtypes and memory usage after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673227 entries, 0 to 673226
Data columns (total 34 columns):
 #   Column                                                            Non-Null Count   Dtype
---  ------                                                            --------------   -----
 0   Change_Type                                                       673227 non-null  category
 1   Covered_Recipient_Type                                            673227 non-null  category
 2   Recipient_Primary_Business_Street_Address_Line1                   672568 non-null  object
 3   Recipient_City                                                    672568 non-null  object
 4   Recipient_State                                                   672008 non-null  category
 5   Recipient_Zip_Code                                                672008 non-null  object
 6   Recipient_Country                                                 672568 non-null  category
 7   Principal_Investigator_1_Profile_ID                               636770 non-null  float64
 8   Principal_Investigator_1_First_Name                               636770 non-null  object
 9   Principal_Investigator_1_Last_Name                                636770 non-null  object
 10  Principal_Investigator_1_Business_Street_Address_Line1            636770 non-null  object
 11  Principal_Investigator_1_City                                     636770 non-null  object
 12  Principal_Investigator_1_State                                    636749 non-null  category
 13  Principal_Investigator_1_Zip_Code                                 636749 non-null  object
 14  Principal_Investigator_1_Country                                  636770 non-null  category
 15  Principal_Investigator_1_Primary_Type                             636770 non-null  category
 16  Principal_Investigator_1_Specialty                                635907 non-null  category
 17  Principal_Investigator_1_License_State_code1                      636770 non-null  category
 18  Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name         673227 non-null  category
 19  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID       673227 non-null  category
 20  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name     673227 non-null  category
 21  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State    608591 non-null  category
 22  Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country  673227 non-null  category
 23  Related_Product_Indicator                                         673227 non-null  category
 24  Total_Amount_of_Payment_USDollars                                 673227 non-null  float64
 25  Date_of_Payment                                                   673227 non-null  object
 26  Form_of_Payment_or_Transfer_of_Value                              673227 non-null  category
 27  Preclinical_Research_Indicator                                    673227 non-null  category
 28  Delay_in_Publication_Indicator                                    673227 non-null  category
 29  Name_of_Study                                                     666425 non-null  object
 30  Dispute_Status_for_Publication                                    673227 non-null  category
 31  Record_ID                                                         673227 non-null  int64
 32  Program_Year                                                      673227 non-null  int64
 33  Payment_Publication_Date                                          673227 non-null  object
dtypes: category(19), float64(2), int64(2), object(11)
memory usage: 91.9+ MB


# to_frame(): converts the resulting Series into a DataFrame.
# observed=False is passed explicitly: Covered_Recipient_Type is categorical
# here, and pandas >= 2.1 emits a FutureWarning when `observed` is left at its
# default for a categorical grouper. False keeps the current behaviour of
# reporting every category.

df.groupby('Covered_Recipient_Type', observed=False)['Total_Amount_of_Payment_USDollars'].sum().to_frame()


# The position in this list fixes the future sort order of the categories,
# from the smallest to the largest.

cats_to_order = [
    "Non-covered Recipient Entity",
    "Covered Recipient Teaching Hospital",
    "Covered Recipient Physician",
    "Non-covered Recipient Individual",
]


# ordered=True makes the dtype take the category order into account.
covered_type = CategoricalDtype(cats_to_order, ordered=True)
covered_type

CategoricalDtype(categories=['Non-covered Recipient Entity',
                  'Covered Recipient Teaching Hospital',
                  'Covered Recipient Physician',
                  'Non-covered Recipient Individual'],
                 ordered=True)


# https://pandas.pydata.org/docs/reference/api/pandas.Series.cat.reorder_categories.html

# Impose the explicit order on the existing categorical column and mark it
# as ordered.
df['Covered_Recipient_Type'] = (
    df['Covered_Recipient_Type']
    .cat
    .reorder_categories(new_categories=cats_to_order, ordered=True)
)
# Show the first three values together with the new category order.
df['Covered_Recipient_Type'].iloc[:3]

0    Covered Recipient Teaching Hospital
1    Covered Recipient Teaching Hospital
2    Covered Recipient Teaching Hospital
Name: Covered_Recipient_Type, dtype: category
Categories (4, object): ['Non-covered Recipient Entity' < 'Covered Recipient Teaching Hospital' < 'Covered Recipient Physician' < 'Non-covered Recipient Individual']


# Same aggregation, now on the ordered categorical column. observed=False is
# explicit because pandas >= 2.1 emits a FutureWarning when `observed` is left
# at its default for a categorical grouper; False keeps every category in the
# result.
df.groupby('Covered_Recipient_Type', observed=False)['Total_Amount_of_Payment_USDollars'].sum().to_frame()


# Re-read the CSV, declaring the ordered categorical dtype for
# Covered_Recipient_Type at load time instead of converting afterwards.
df_raw_2 = pd.read_csv(
    'OP_DTL_RSRCH_PGYR2017_P06302020.csv',
    low_memory=False,
    dtype={'Covered_Recipient_Type': covered_type},
)


%%timeit
# Baseline timing: group by the column as stored in df_raw (object dtype).
df_raw.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

70.5 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


%%timeit
df.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

3.97 ms ± 60 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


import pandas as pd
from pandas.api.types import CategoricalDtype


# Three sample accounts, each with a status and monthly figures.
sales_1 = [
    {'account': 'Jones LLC', 'Status': 'Gold', 'Jan': 150, 'Feb': 200, 'Mar': 140},
    {'account': 'Alpha Co', 'Status': 'Gold', 'Jan': 200, 'Feb': 210, 'Mar': 215},
    {'account': 'Blue Inc', 'Status': 'Silver', 'Jan': 50, 'Feb': 90, 'Mar': 95},
]


df_1 = pd.DataFrame(sales_1)
df_1


# Ordered status dtype: 'Silver' ranks below 'Gold'.
status_type = CategoricalDtype(['Silver', 'Gold'], ordered=True)


# Replace the plain string column with the ordered categorical version.
df_1['Status'] = df_1['Status'].astype(status_type)


df_1


df_1['Status']

0      Gold
1      Gold
2    Silver
Name: Status, dtype: category
Categories (2, object): ['Silver' < 'Gold']


# Second batch of accounts; 'Bronze' is a status that does not occur in sales_1.
sales_2 = [
    {'account': 'Smith Co', 'Status': 'Silver', 'Jan': 100, 'Feb': 100, 'Mar': 70},
    {'account': 'Bingo', 'Status': 'Bronze', 'Jan': 310, 'Feb': 65, 'Mar': 80},
]


df_2 = pd.DataFrame(sales_2)
df_2.head()


# Cast to the ordered status dtype. 'Bronze' is not one of its categories, so
# astype turns that value into NaN without raising (see the output below).
df_2['Status'] = df_2['Status'].astype(status_type)
df_2['Status']

0    Silver
1       NaN
Name: Status, dtype: category
Categories (2, object): ['Silver' < 'Gold']


# The full frame after the cast: the 'Bingo' row no longer has a status.
df_2


# The Status column on its own: the unknown value is stored as NaN.
df_2['Status']

0    Silver
1       NaN
Name: Status, dtype: category
Categories (2, object): ['Silver' < 'Gold']


# Rebuild the first sample frame from scratch for the unordered example.
sales_1 = [
    {'account': 'Jones LLC', 'Status': 'Gold', 'Jan': 150, 'Feb': 200, 'Mar': 140},
    {'account': 'Alpha Co', 'Status': 'Gold', 'Jan': 200, 'Feb': 210, 'Mar': 215},
    {'account': 'Blue Inc', 'Status': 'Silver', 'Jan': 50, 'Feb': 90, 'Mar': 95},
]


df_1 = pd.DataFrame(sales_1)
df_1


# Define an unordered category: the categories are taken from the values
# present in the column.
df_1['Status'] = df_1['Status'].astype('category')
df_1['Status']

0      Gold
1      Gold
2    Silver
Name: Status, dtype: category
Categories (2, object): ['Gold', 'Silver']


# Rebuild the second sample frame; it contains the extra 'Bronze' status.
sales_2 = [
    {'account': 'Smith Co', 'Status': 'Silver', 'Jan': 100, 'Feb': 100, 'Mar': 70},
    {'account': 'Bingo', 'Status': 'Bronze', 'Jan': 310, 'Feb': 65, 'Mar': 80},
]


df_2 = pd.DataFrame(sales_2)
df_2


# Unordered category again: the categories come from this frame's own values.
df_2['Status'] = df_2['Status'].astype('category')
df_2['Status']

0    Silver
1    Bronze
Name: Status, dtype: category
Categories (2, object): ['Bronze', 'Silver']


# Combine the two data frames into one.
# NOTE: the two Status columns have different category sets, and the combined
# column comes back as plain object dtype (see the output below).
df_combined = pd.concat([df_1, df_2])


df_combined


df_combined['Status']

0      Gold
1      Gold
2    Silver
0    Silver
1    Bronze
Name: Status, dtype: object

# Использование типа данных категории в pandas

## Введение

## Тип данных Category в pandas

## Подготовка данных

## Производительность

## Осторожно

## Общие рекомендации

	Change_Type	Covered_Recipient_Type	Noncovered_Recipient_Entity_Name	Teaching_Hospital_CCN	Teaching_Hospital_ID	Teaching_Hospital_Name	Physician_Profile_ID	Physician_First_Name	Physician_Middle_Name	Physician_Last_Name	...	Preclinical_Research_Indicator	Delay_in_Publication_Indicator	Name_of_Study	Dispute_Status_for_Publication	Record_ID	Program_Year	Payment_Publication_Date	ClinicalTrials_Gov_Identifier	Research_Information_Link	Context_of_Research
0	UNCHANGED	Covered Recipient Teaching Hospital	NaN	410007.0	4819.0	RHODE ISLAND HOSPITAL	NaN	NaN	NaN	NaN	...	No	No	PALLASPALBOCICLIB COLLABORATIVE ADJUVANT STUDY...	No	501845079	2017	06/30/2020	NaN	NaN	NaN
1	UNCHANGED	Covered Recipient Teaching Hospital	NaN	390111.0	5027.0	HOSPITAL OF THE UNIV OF PENNA	NaN	NaN	NaN	NaN	...	No	No	An Open-Label, Single-Arm, Multicenter, Phase ...	No	506101597	2017	06/30/2020	NaN	NaN	10011004 C2D15 Dec 15 2016
2	UNCHANGED	Covered Recipient Teaching Hospital	NaN	10033.0	5681.0	UNIVERSITY OF ALABAMA HOSPITAL	NaN	NaN	NaN	NaN	...	No	No	PHASE 3 STUDY OF ANTI PDL1 WITH ABRAXANE IN TN...	No	485544131	2017	06/30/2020	NaN	NaN	NaN
3	UNCHANGED	Covered Recipient Teaching Hospital	NaN	490007.0	5507.0	SENTARA NORFOLK GENERAL HOSPITAL	NaN	NaN	NaN	NaN	...	No	No	QP ExCELs	No	509865461	2017	06/30/2020	NaN	NaN	NaN
4	UNCHANGED	Covered Recipient Teaching Hospital	NaN	520078.0	5350.0	ST. FRANCIS HOSPITAL	NaN	NaN	NaN	NaN	...	No	No	Dimethyl Fumarate (DMF) Observational Study	No	455803127	2017	06/30/2020	NaN	NaN	NaN

	Column_Name	Num_Unique
33	Payment_Publication_Date	1
28	Delay_in_Publication_Indicator	1
32	Program_Year	1
30	Dispute_Status_for_Publication	2
27	Preclinical_Research_Indicator	2
23	Related_Product_Indicator	2
26	Form_of_Payment_or_Transfer_of_Value	3
14	Principal_Investigator_1_Country	4
0	Change_Type	4
1	Covered_Recipient_Type	4
15	Principal_Investigator_1_Primary_Type	6
6	Recipient_Country	9
22	Applicable_Manufacturer_or_Applicable_GPO_Maki...	21
21	Applicable_Manufacturer_or_Applicable_GPO_Maki...	35
4	Recipient_State	54
17	Principal_Investigator_1_License_State_code1	54
12	Principal_Investigator_1_State	55
16	Principal_Investigator_1_Specialty	244
25	Date_of_Payment	365
18	Submitting_Applicable_Manufacturer_or_Applicab...	569
19	Applicable_Manufacturer_or_Applicable_GPO_Maki...	637
20	Applicable_Manufacturer_or_Applicable_GPO_Maki...	649
11	Principal_Investigator_1_City	4209
3	Recipient_City	4454
8	Principal_Investigator_1_First_Name	8639
5	Recipient_Zip_Code	13901
13	Principal_Investigator_1_Zip_Code	14406
29	Name_of_Study	14460
9	Principal_Investigator_1_Last_Name	22355
10	Principal_Investigator_1_Business_Street_Addre...	30726
7	Principal_Investigator_1_Profile_ID	31221
2	Recipient_Primary_Business_Street_Address_Line1	41664
24	Total_Amount_of_Payment_USDollars	156164
31	Record_ID	673227

	Total_Amount_of_Payment_USDollars
Covered_Recipient_Type
Covered Recipient Physician	1.037440e+08
Covered Recipient Teaching Hospital	1.140148e+09
Non-covered Recipient Entity	4.009361e+09
Non-covered Recipient Individual	3.200450e+06

	account	Status	Jan	Feb	Mar
0	Jones LLC	Gold	150	200	140
1	Alpha Co	Gold	200	210	215
2	Blue Inc	Silver	50	90	95