Pandas大数据量内存优化
#头条创作挑战赛# 一、背景介绍
Pandas 在处理大数据(尤其是列比较多的场景)时,如果不做优化,内存占用还是很大的,下面通过一个实例来说明可以怎样优化
# First, generate 180k rows of demo data, 151 columns per row.
import pandas as pd
import numpy as np


def gen_big_data(csv_file: str, big_data_count: int = 90000) -> None:
    """Generate a demo CSV with ``big_data_count`` rows and 151 columns.

    Each row holds a ``Name`` column plus 30 groups of five columns
    (``date str N`` / ``bool N`` / ``int N`` / ``float N`` / ``str N``).

    :param csv_file: path the CSV is written to
    :param big_data_count: number of rows to generate (default 90k)
    """
    chars = "abcdefghijklmnopqrstuvwxyz"
    # One timestamp per row, 30 seconds apart, reused across all 30 groups.
    dates = pd.date_range(start="2020-01-01", periods=big_data_count, freq="30s")
    big_data_cols = ["Name"]
    for group in range(1, 31):
        big_data_cols.extend([
            f"date str {group}", f"bool {group}", f"int {group}",
            f"float {group}", f"str {group}",
        ])
    big_data = []
    for i in range(big_data_count):
        row = [f"Name Item {(i + 1)}"]
        for _ in range(30):
            row.extend([
                str(dates[i]),
                i % 2 == 0,
                np.random.randint(10000, 100000),
                10000 * np.random.random(),
                # a run of 15 identical letters -> low-cardinality str column
                chars[np.random.randint(0, 26)] * 15,
            ])
        big_data.append(row)
    df = pd.DataFrame(data=big_data, columns=big_data_cols)
    # bugfix: the original passed index=None, which only works because None
    # is falsy; index=False is the documented way to omit the row index.
    df.to_csv(csv_file, index=False)


if __name__ == "__main__":
    # Adjust the output path and simulated row count (default 90k) as needed.
    gen_big_data("./files/custom_big_data.csv", 180000)
查看生成的数据格式,可以看到每一行有 151 列
# After saving the data, check the memory footprint first.
import pandas as pd


def info_memory(csv_file: str):
    """Load the CSV and print its deep memory-usage summary."""
    frame = pd.read_csv(csv_file)
    # DataFrame.info() prints its report itself and returns None,
    # which is why a trailing "None" appears in the output.
    print(frame.info(memory_usage="deep"))


if __name__ == "__main__":
    info_memory("./files/custom_big_data.csv")
打印结果如下,可以看到当前内存占用为 862.1MB RangeIndex: 180000 entries, 0 to 179999 Columns: 151 entries, Name to str 30 dtypes: bool(30), float64(30), int64(30), object(61) memory usage: 862.1 MB 复制代码
# Inspect the memory usage broken down by dtype.
def info_memory_by_d_type(csv_file: str):
    """Print the mean per-column deep memory usage for each dtype in the CSV.

    :param csv_file: path of the CSV to load
    """
    df = pd.read_csv(csv_file)
    for d_type in ["bool", "float64", "int64", "object"]:
        d_type_selected = df.select_dtypes(include=[d_type])
        # bugfix (naming): memory_usage(deep=True) reports BYTES, not bits;
        # the original local name `mem_mean_bit` was misleading.
        mem_mean_bytes = d_type_selected.memory_usage(deep=True).mean()
        mem_mean_mb = mem_mean_bytes / 1024 ** 2
        print(f"mean memory usage: {d_type:<7} - {mem_mean_mb:.3f} M")
输出结果如下,其中 object 类型占用内存最多mean memory usage: bool - 0.166 M mean memory usage: float64 - 1.329 M mean memory usage: int64 - 1.329 M mean memory usage: object - 12.494 M 复制代码二、优化方案
# Report the memory footprint of a pandas object.
def info_mem_usage_mb(pd_obj):
    """Return the deep memory usage of a DataFrame or Series as an 'x.xxx MB' string."""
    usage = pd_obj.memory_usage(deep=True)
    # A DataFrame reports per-column usage, so collapse it to a total;
    # a Series already yields a single number.
    total_bytes = usage.sum() if isinstance(pd_obj, pd.DataFrame) else usage
    return f"{total_bytes / 1024 ** 2:02.3f} MB"
int 和 float 类型
对于 int 和 float 类型的数据,Pandas 加载到内存中的数据,默认是 int64 和 float64。一般场景下的数据,用 int32 和 float32 就足够了,用 numpy.iinfo 和 numpy.finfo 可以打印对应类型的取值范围Machine parameters for int32 --------------------------------------------------------------- min = -2147483648 max = 2147483647 --------------------------------------------------------------- Machine parameters for int64 --------------------------------------------------------------- min = -9223372036854775808 max = 9223372036854775807 --------------------------------------------------------------- 复制代码Machine parameters for float32 --------------------------------------------------------------- ... maxexp = 128 max = 3.4028235e+38 nexp = 8 min = -max --------------------------------------------------------------- Machine parameters for float64 --------------------------------------------------------------- ... maxexp = 1024 max = 1.7976931348623157e+308 nexp = 11 min = -max --------------------------------------------------------------- 复制代码
# Downcast the int and float columns separately.
def optimize_int_and_float():
    """Downcast int64 columns to unsigned ints and float64 columns to float32,
    then print the before/after memory usage of each group.

    NOTE(review): relies on a module-level ``df`` defined elsewhere in the
    article's running example.
    """
    original_ints = df.select_dtypes(include=["int64"])
    shrunk_ints = original_ints.apply(pd.to_numeric, downcast="unsigned")
    original_floats = df.select_dtypes(include=["float64"])
    shrunk_floats = original_floats.apply(pd.to_numeric, downcast="float")
    print("int before ", info_mem_usage_mb(original_ints))
    print("int converted ", info_mem_usage_mb(shrunk_ints))
    print("float before ", info_mem_usage_mb(original_floats))
    print("float converted", info_mem_usage_mb(shrunk_floats))
优化后的结果如下,内存减少 50% 左右int before 41.199 MB int converted 20.599 MB float before 41.199 MB float converted 20.599 MB 复制代码
object 类型中的普通 str 数据
获取 object 类型数据,并调用 describe() 展示统计信息
# str 1 .. str 30 have only 26 possible values each (low cardinality), so
# columns whose distinct-value ratio is below 40% are converted to pandas'
# category dtype.
def optimize_obj():
    """Convert low-cardinality object columns of the module-level ``df`` to
    ``category`` dtype and print before/after memory usage.

    bugfix: the final print referenced ``self.df_obj_converted``, but this is
    a plain function with no ``self`` (NameError at runtime); it now uses the
    local ``df_obj_converted``.
    """
    df_obj = df.select_dtypes(include=["object"])
    df_obj_converted = pd.DataFrame()
    for col in df_obj.columns:
        unique_count = len(df_obj[col].unique())
        total_count = len(df_obj[col])
        # Convert columns where under 40% of values are distinct to category.
        if unique_count / total_count <= 0.4:
            df_obj_converted.loc[:, col] = df_obj[col].astype("category")
        else:
            df_obj_converted.loc[:, col] = df_obj[col]
    print("object before ", info_mem_usage_mb(df_obj))
    print("object converted", info_mem_usage_mb(df_obj_converted))
执行结果如下,降低了 300+M 的内存object before 774.602 MB object converted 409.047 MB 复制代码
# Optimize the date columns stored as object strings.
def optimize_date_str():
    """Convert every 'date str *' column of the module-level ``df`` to
    datetime64 and print before/after memory usage."""
    df_date = pd.DataFrame()
    df_date_converted = pd.DataFrame()
    # Only the 30 'date str N' columns participate.
    date_columns = [name for name in df.columns if name.startswith("date str")]
    for col_name in date_columns:
        df_date.loc[:, col_name] = df[col_name]
        df_date_converted.loc[:, col_name] = pd.to_datetime(df[col_name])
    print("date before ", info_mem_usage_mb(df_date))
    print("date converted", info_mem_usage_mb(df_date_converted))
执行结果如下,也降低了 300+M 的内存date before 391.388 MB date converted 41.199 MB 复制代码三、总体优化
# Combining all optimization steps above into class PandasMemoryOptimizeDemo.
import pandas as pd
import numpy as np


class PandasMemoryOptimizeDemo:
    """Load a wide CSV and shrink its memory footprint by downcasting numeric
    columns, categorizing low-cardinality strings, and parsing date strings."""

    # source frame loaded from csv_file
    df: pd.DataFrame
    # per-dtype converted frames, populated by the optimize_* methods
    df_int_converted: pd.DataFrame
    df_float_converted: pd.DataFrame
    df_obj_converted: pd.DataFrame
    df_date_converted: pd.DataFrame

    def __init__(self, csv_file: str):
        """Read the CSV at ``csv_file`` into ``self.df`` with default dtypes."""
        self.csv_file = csv_file
        self.df = pd.read_csv(self.csv_file)

    @staticmethod
    def info_mem_usage_mb(pd_obj):
        """Return the deep memory usage of a DataFrame/Series as 'x.xxx MB'."""
        if isinstance(pd_obj, pd.DataFrame):
            # a DataFrame reports per-column usage; sum it to a total
            mem_usage = pd_obj.memory_usage(deep=True).sum()
        else:
            mem_usage = pd_obj.memory_usage(deep=True)
        # convert to MB before returning
        return f"{mem_usage / 1024 ** 2:02.3f} MB"

    def optimize_int_and_float(self):
        """Downcast int64 -> unsigned and float64 -> float32 columns; print sizes."""
        df_int = self.df.select_dtypes(include=["int64"])
        self.df_int_converted = df_int.apply(pd.to_numeric, downcast="unsigned")
        df_float = self.df.select_dtypes(include=["float64"])
        self.df_float_converted = df_float.apply(pd.to_numeric, downcast="float")
        print("int before ", self.info_mem_usage_mb(df_int))
        print("int converted ", self.info_mem_usage_mb(self.df_int_converted))
        print("float before ", self.info_mem_usage_mb(df_float))
        print("float converted", self.info_mem_usage_mb(self.df_float_converted))

    def optimize_obj(self):
        """Convert low-cardinality object columns to category dtype; print sizes."""
        df_obj = self.df.select_dtypes(include=["object"])
        self.df_obj_converted = pd.DataFrame()
        for col in df_obj.columns:
            unique_count = len(df_obj[col].unique())
            total_count = len(df_obj[col])
            # convert columns with under 40% distinct values to category dtype
            if unique_count / total_count <= 0.4:
                self.df_obj_converted.loc[:, col] = df_obj[col].astype("category")
            else:
                self.df_obj_converted.loc[:, col] = df_obj[col]
        print("object before ", self.info_mem_usage_mb(df_obj))
        print("object converted", self.info_mem_usage_mb(self.df_obj_converted))

    def optimize_date_str(self):
        """Parse every 'date str *' column into datetime64; print sizes."""
        df_date = pd.DataFrame()
        self.df_date_converted = pd.DataFrame()
        for col_name in self.df.columns:
            if col_name.startswith("date str"):
                df_date.loc[:, col_name] = self.df[col_name]
                self.df_date_converted.loc[:, col_name] = pd.to_datetime(self.df[col_name])
        print("date before ", self.info_mem_usage_mb(df_date))
        print("date converted", self.info_mem_usage_mb(self.df_date_converted))

    def optimize_all(self):
        """Run all optimizations and assemble one fully converted frame.

        Runs the three optimize_* steps (which populate the converted
        attributes), then overwrites the matching columns of a copy of
        ``self.df`` and prints the overall before/after memory usage.
        Note: date columns are written last, replacing the object versions
        left by optimize_obj for those columns.
        """
        self.optimize_int_and_float()
        self.optimize_obj()
        self.optimize_date_str()
        df_converted = self.df.copy()
        df_converted[self.df_int_converted.columns] = self.df_int_converted
        df_converted[self.df_float_converted.columns] = self.df_float_converted
        df_converted[self.df_obj_converted.columns] = self.df_obj_converted
        df_converted[self.df_date_converted.columns] = self.df_date_converted
        print("before ", self.info_mem_usage_mb(self.df))
        print("converted", self.info_mem_usage_mb(df_converted))


if __name__ == "__main__":
    optimize_demo = PandasMemoryOptimizeDemo("./files/custom_big_data.csv")
    optimize_demo.optimize_all()
执行结果如下,优化效果还是很明显的before 862.149 MB converted 105.207 MB 复制代码四、直接优化 read_csv 方法
写代码的过程中,如果每次都按照这样的步骤,其实还是很繁琐,那能不能在调用 read_csv 方法时就进行优化呢?
接下来就一起来探索一下
在 PyCharm 中,点击 read_csv 进入源码,发现该方法提供了非常丰富的参数(50+),这里只列举需要的参数def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], # General Parsing Configuration dtype: DtypeArg | None = None, converters=None, # Datetime Handling parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, low_memory=_c_parser_defaults["low_memory"], memory_map=False, storage_options: StorageOptions = None, ): # locals() should never be modified kwds = locals().copy() 复制代码
# dtype and parse_dates can be specified directly; the final code is below.
def big_data_optimized_read_csv(self):
    """Read the CSV with dtypes downcast and dates parsed up front, so no
    separate post-load conversion pass is needed; print the resulting size."""
    column_dtypes = {}
    for group in range(1, 31):
        column_dtypes[f"int {group}"] = "int32"
        column_dtypes[f"float {group}"] = "float32"
        column_dtypes[f"str {group}"] = "category"
    # 'date str N' sits at positional index 1 + 5*(N-1): column 0 is Name,
    # then each group contributes five columns.
    date_column_indexes = [5 * (group - 1) + 1 for group in range(1, 31)]
    self.df = pd.read_csv(
        self.csv_file,
        dtype=column_dtypes,
        parse_dates=date_column_indexes,
    )
    print("optimized read_csv: ", self.info_mem_usage_mb(self.df))
执行结果如下,内存占用也大大降低了optimized read_csv: 105.207 MB