Python Pandas

1. Install

1# Pip
2pip install pandas
3# Conda
4conda install -c conda-forge pandas
5conda install conda-forge::pandas

2. How to

2.1. Convert String to DateTime

1df_db['ds'] = pd.to_datetime(df_db['ds'], format='%Y%m%d')

2.2. Re-Index DateTime series

 1import pandas as pd
 2# By Merge with new datetime series
 3# df is incomplete series data, which contains a column 'ds' as datetime series. 
 4idx = pd.date_range(start = '2023-02-01', end = '2023-05-31', freq = '1D')
 5df_ds = pd.DataFrame({ 'ds': idx })
 6df_merged = pd.merge(df, df_ds, on = 'ds', how = 'outer').sort_values('ds')
 7# Fill 0 for Nan value of y2
 8df_merged['y2'] = df_merged['y'].fillna(value = 0)
 9
10# By Reindex origin ds column
11idx = pd.date_range(start = '2023-02-01', end = '2023-05-31', freq = '1D')
12df = df.set_index('ds') # set the DataFrame index by 'ds' column
13df = df.reindex(idx, fill_value = 0) # reindex the DataFrame by idx(datetime series generated), and fill 0 for Nan value
14df = df.reset_index() # reset the DataFrame index by number sequence
15df.columns = ['ds', 'y'] # reset the DataFrame name
16
17# For the perf reason, inplace should be use firstly.
18df.set_index('ds', inplace = True)
19df = df.reindex(idx, fill_value = 0)
20df.reset_index(inplace = True)
21df.columns = ['ds', 'y']
22df.head()

2.3. Convert Horizontal table as Vertical table

Some time data table looks like below, the values at columns which starts with “d_”, convert the records as rows view.

id	item_id	dept_id	cat_id	store_id	state_id	…	d_1939	d_1940	d_1941
HOBBIES_1_001_CA_1_evaluation	HOBBIES_1_001	HOBBIES_1	HOBBIES	CA_1	CA	…	2	4	0
HOBBIES_1_002_CA_1_evaluation	HOBBIES_1_002	HOBBIES_1	HOBBIES	CA_1	CA	…	0	1	2
HOBBIES_1_003_CA_1_evaluation	HOBBIES_1_003	HOBBIES_1	HOBBIES	CA_1	CA	…	1	0	2
HOBBIES_1_004_CA_1_evaluation	HOBBIES_1_004	HOBBIES_1	HOBBIES	CA_1	CA	…	1	1	0
HOBBIES_1_005_CA_1_evaluation	HOBBIES_1_005	HOBBIES_1	HOBBIES	CA_1	CA	…	0	0	0

1# define key columns, which will be keep for each rows.
2index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
3# use "melt" method, columns starts with "d_", will be convert as "d" column, and related values will be as "sales" column.
4grid_df = pd.melt(train_df,
5                  id_vars=index_columns,
6                  var_name='d',
7                  value_name='sales')

2.4. Reduce memory by change the numberic type (Depend on Numpy)

 1def memory_optimizer(df, verbose=True):
 2    """
 3    Reduce memory by change the numberic type
 4
 5    Parameters
 6    ----------
 7    df : DataFrame(pandas)
 8        the DataFrame instance which will be perform memory reducing operation.
 9    verbose : bool, default = True
10        print verbose level log into default output or not, default is true
11    """
12
13    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
14    info_int8 = np.iinfo(np.int8)
15    info_int16 = np.iinfo(np.int16)
16    info_int32 = np.iinfo(np.int32)
17    info_float16 = np.finfo(np.float16)
18    info_float32 = np.finfo(np.float32)
19    megabytes = 1 << 20
20    origin_memory_usage = df.memory_usage().sum() / megabytes
21
22    for col in df.columns: 
23        ct = str(df[col].dtypes)
24        if ct in numerics:
25            col_min = df[col].min()
26            col_max = df[col].max()
27            if ct.startswith('int'):
28                if col_min > info_int8.min and col_max < info_int8.max:
29                    df[col] = df[col].astype(np.int8)
30                elif col_min > info_int16.min and col_max < info_int16.max:
31                    df[col] = df[col].astype(np.int16)
32                elif col_min > info_int32.min and col_max < info_int32.max:
33                    df[col] = df[col].astype(np.int32)
34                else:
35                    df[col] = df[col].astype(np.int64)
36            else:
37                if col_min > info_float16.min and col_max < info_float16.max:
38                    df[col] = df[col].astype(np.float16)
39                elif col_min > info_float32.min and col_max < info_float32.max:
40                    df[col] = df[col].astype(np.float32)
41                else:
42                    df[col] = df[col].astype(np.float64)
43    if verbose:
44        optimized_memory_usage = df.memory_usage().sum() / megabytes
45        print('Memory usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(optimized_memory_usage, 100 * (origin_memory_usage - optimized_memory_usage) / origin_memory_usage))
46    return df

作者|Author: RockSolid

发表日期|Publish Date: Jun 27, 2024

修改日期|Modified Date: Jun 27, 2024