import pandas as pd
import numpy as np


# Load the sample sales data straight from GitHub (needs network access).
df = pd.read_csv("https://github.com/dm-fedorov/pandas_basic/blob/master/%D0%B1%D1%8B%D1%81%D1%82%D1%80%D0%BE%D0%B5%20%D0%B2%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5%20%D0%B2%20pandas/data/sales_data_types.csv?raw=True")

df


# Both year columns are still object dtype (strings), so "+" concatenates the
# text instead of adding the amounts.
df['2016'] + df['2017']

0      $125,000.00$162500.00
1    $920,000.00$101,2000.00
2        $50,000.00$62500.00
3      $350,000.00$490000.00
4        $15,000.00$12750.00
dtype: object


df.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Customer Number  5 non-null      float64
 1   Customer Name    5 non-null      object
 2   2016             5 non-null      object
 3   2017             5 non-null      object
 4   Percent Growth   5 non-null      object
 5   Jan Units        5 non-null      object
 6   Month            5 non-null      int64
 7   Day              5 non-null      int64
 8   Year             5 non-null      int64
 9   Active           5 non-null      object
dtypes: float64(1), int64(3), object(6)
memory usage: 528.0+ bytes


df['Customer Number'].astype('int') # 'int' is resolved to int64 here, as the printed dtype shows

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int64


# astype() returns a new Series, so the result is assigned back to the column
# to make the conversion stick.
df["Customer Number"] = df['Customer Number'].astype('int')
df.dtypes

Customer Number     int64
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

df


# This raises an exception: values such as "$125,000.00" cannot be parsed as floats.

# df['2016'].astype('float')


# This raises an exception as well: the "Closed" entry in "Jan Units" is not an integer.

# df['Jan Units'].astype('int')


# Pitfall: astype('bool') turns every non-empty string into True, so the "N"
# in the last row comes out as True as well.
df['Active'].astype('bool')

0    True
1    True
2    True
3    True
4    True
Name: Active, dtype: bool


# A {column: dtype} mapping converts several columns in a single call.
df.astype({'Customer Number': 'int', 'Customer Name': 'str'}).dtypes

Customer Number     int64
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object


def convert_currency(val):
    """
    Convert a currency string to a float.

    The dollar sign and the thousands separators (commas) are removed
    before parsing, so "$125,000.00" becomes 125000.0.

    Values that are not strings (for example floats from a column that has
    already been converted) are handed to float() unchanged, so applying
    the function to the same column twice no longer fails.

    Raises ValueError if the cleaned text is not a valid number.
    """
    if not isinstance(val, str):
        # Already numeric: there is nothing to strip, just normalise the type.
        return float(val)
    new_val = val.replace(',', '').replace('$', '')
    return float(new_val)


# apply() runs the converter on every value of the column; the result is float64.
df['2016'].apply(convert_currency)

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64


# The same clean-up written inline: a lambda strips "$" and ",", then
# astype('float') parses the remaining text.
df['2016'].apply(lambda x: x.replace('$', '').replace(',', '')).astype('float')

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64


# Store the converted values back so that both year columns become float64.
df['2016'] = df['2016'].apply(convert_currency)
df['2017'] = df['2017'].apply(convert_currency)

df.dtypes

Customer Number      int64
Customer Name       object
2016               float64
2017               float64
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object


# Drop the percent sign, parse as float and divide by 100 to get a fraction.
df['Percent Growth'].apply(lambda x: x.replace('%', '')).astype('float') / 100

0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64


def convert_percent(val):
    """
    Turn a percentage string into its fractional float value.

    Every "%" character is dropped, the remaining text is parsed as a
    float and the result is divided by 100, e.g. "30.00%" -> 0.3.
    """
    percent_points = float(val.replace('%', ''))
    return percent_points / 100


# Apply convert_percent to every value; the column itself is not modified here.
df['Percent Growth'].apply(convert_percent)

0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64


# Comparing with "Y" already yields a boolean Series, so it can be stored
# directly; every value other than "Y" becomes False.
df["Active"] = df["Active"] == "Y"

df


df.dtypes

Customer Number      int64
Customer Name       object
2016               float64
2017               float64
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active                bool
dtype: object


# errors='coerce' turns values that cannot be parsed (here "Closed") into NaN
# instead of raising.
pd.to_numeric(df['Jan Units'], errors='coerce')

0    500.0
1    700.0
2    125.0
3     75.0
4      NaN
Name: Jan Units, dtype: float64


# Replace the NaN produced by the coercion with 0.
pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)

0    500.0
1    700.0
2    125.0
3     75.0
4      0.0
Name: Jan Units, dtype: float64


# to_datetime assembles one date per row from the Month / Day / Year columns.
pd.to_datetime(df[['Month', 'Day', 'Year']])

0   2015-01-10
1   2014-06-15
2   2016-03-29
3   2015-10-27
4   2014-02-02
dtype: datetime64[ns]


# Persist both conversions: a new Start_Date column built from the date parts
# and a numeric "Jan Units" column with unparseable entries set to 0.
df["Start_Date"] = pd.to_datetime(df[['Month', 'Day', 'Year']])
df["Jan Units"] = pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)

df


df.dtypes

Customer Number             int64
Customer Name              object
2016                      float64
2017                      float64
Percent Growth             object
Jan Units                 float64
Month                       int64
Day                         int64
Year                        int64
Active                       bool
Start_Date         datetime64[ns]
dtype: object


# Do the conversions while reading the file: `dtype` fixes the customer number
# and each entry in `converters` is applied to the values of its column.
df_2 = pd.read_csv("https://github.com/dm-fedorov/pandas_basic/blob/master/%D0%B1%D1%8B%D1%81%D1%82%D1%80%D0%BE%D0%B5%20%D0%B2%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5%20%D0%B2%20pandas/data/sales_data_types.csv?raw=True",
                   dtype={'Customer Number': 'int'},
                   converters={'2016': convert_currency,
                               '2017': convert_currency,
                               'Percent Growth': convert_percent,
                               'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
                               # NOTE(review): np.where returns a NumPy array even for a
                               # single value, which is presumably why Active is reported
                               # as object in df_2.dtypes; `x == "Y"` would return a plain
                               # bool instead - confirm before changing.
                               'Active': lambda x: np.where(x == "Y", True, False)
                              })


df_2


df_2.dtypes

Customer Number      int64
Customer Name       object
2016               float64
2017               float64
Percent Growth     float64
Jan Units          float64
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

Pandas	Python	NumPy	Использование
object	str или смесь	string, unicode, смешанные типы	Текстовые или смешанные числовые и нечисловые значения
int64	int	int_, int8, int16, int32, int64, uint8, uint16, uint32, uint64	Целые числа
float64	float	float_, float16, float32, float64	Числа с плавающей точкой
bool	bool	bool_	Значения True/False
datetime64	datetime	datetime64[ns]	Значения даты и времени
timedelta64[ns]	NA	NA	Разность между двумя datetimes
category	NA	NA	Ограниченный список текстовых значений

Обзор типов данных Pandas¶

Введение¶

Типы данных Pandas¶

Почему нас это волнует?¶

Использование функции astype()¶

Дополнительно¶

Пользовательские функции преобразования¶

Вспомогательные функции pandas¶

Собираем все вместе¶

Резюме¶

	Customer Number	Customer Name	2016	2017	Percent Growth	Jan Units	Month	Day	Year	Active
0	10002.0	Quest Industries	$125,000.00	$162500.00	30.00%	500	1	10	2015	Y
1	552278.0	Smith Plumbing	$920,000.00	$101,2000.00	10.00%	700	6	15	2014	Y
2	23477.0	ACME Industrial	$50,000.00	$62500.00	25.00%	125	3	29	2016	Y
3	24900.0	Brekke LTD	$350,000.00	$490000.00	4.00%	75	10	27	2015	Y
4	651029.0	Harbor Co	$15,000.00	$12750.00	-15.00%	Closed	2	2	2014	N

	Customer Number	Customer Name	2016	2017	Percent Growth	Jan Units	Month	Day	Year	Active
0	10002	Quest Industries	$125,000.00	$162500.00	30.00%	500	1	10	2015	Y
1	552278	Smith Plumbing	$920,000.00	$101,2000.00	10.00%	700	6	15	2014	Y
2	23477	ACME Industrial	$50,000.00	$62500.00	25.00%	125	3	29	2016	Y
3	24900	Brekke LTD	$350,000.00	$490000.00	4.00%	75	10	27	2015	Y
4	651029	Harbor Co	$15,000.00	$12750.00	-15.00%	Closed	2	2	2014	N

	Customer Number	Customer Name	2016	2017	Percent Growth	Jan Units	Month	Day	Year	Active
0	10002	Quest Industries	125000.0	162500.0	30.00%	500	1	10	2015	True
1	552278	Smith Plumbing	920000.0	1012000.0	10.00%	700	6	15	2014	True
2	23477	ACME Industrial	50000.0	62500.0	25.00%	125	3	29	2016	True
3	24900	Brekke LTD	350000.0	490000.0	4.00%	75	10	27	2015	True
4	651029	Harbor Co	15000.0	12750.0	-15.00%	Closed	2	2	2014	False