自动将嵌套json转换为pd.DataFrame的小函数

发布于 2022-09-01  499 次阅读


使用递归的方法自动分析嵌套json的结构,获得pd.json_normalize函数中的record_path和meta参数,然后将嵌套json转为pd.DataFrame。


class json_nor():
    """Convert nested JSON to a DataFrame by inferring ``pd.json_normalize`` arguments.

    The structure of a sample record is walked recursively: every list found
    under a dict becomes a ``record_path`` candidate and every scalar leaf
    becomes a ``meta`` path. The inferred arguments are stored on the instance
    (``arg_record`` / ``arg_meta``) and the converted frame in ``nor_data``.
    """

    def __init__(self, data=None):
        """Store the JSON data and initialise the inferred-argument state.

        Args:
            data (dict | list, optional): Parsed JSON (a record or a list of
                records). Defaults to None.
        """
        self.data = data
        self.arg_record = []
        self.arg_meta = []
        self.nor_data = None

    @staticmethod
    def get_dict_allkeys(value, key=None, arg_record=None, arg_meta=None):
        """Collect ``record_path`` and ``meta`` candidates for ``json_normalize``.

        Paths one key deep are stored as the bare key; deeper paths are stored
        as a list of keys. Lists are not descended into.

        Args:
            value: The JSON value to analyse (normally a dict).
            key (list, optional): Keys leading to ``value``. Defaults to None
                (top level).
            arg_record (list, optional): Output list extended in place with
                record paths. A new list is created when None.
            arg_meta (list, optional): Output list extended in place with meta
                paths. A new list is created when None.

        Returns:
            tuple: ``(arg_record, arg_meta)`` - the lists that were filled.
        """
        if key is None:
            key = []
        if arg_record is None:
            arg_record = []
        if arg_meta is None:
            arg_meta = []

        if isinstance(value, dict):
            for key_sub, value_sub in value.items():
                json_nor.get_dict_allkeys(
                    value_sub,
                    key=[*key, key_sub],
                    arg_record=arg_record,
                    arg_meta=arg_meta,
                )
        elif not key:
            # A top-level value has no key path of its own. For a list of
            # records the first dict is used as the structural sample; any
            # other top-level value contributes nothing.
            if isinstance(value, list) and value and isinstance(value[0], dict):
                json_nor.get_dict_allkeys(
                    value[0], arg_record=arg_record, arg_meta=arg_meta
                )
        elif isinstance(value, list):
            # Keep the full path: a list nested under dicts is only reachable
            # through all of its parent keys, not just the first one.
            arg_record.append(key[0] if len(key) == 1 else key)
        else:
            arg_meta.append(key[0] if len(key) == 1 else key)

        return arg_record, arg_meta

    @staticmethod
    def _record_prefix(path, sep):
        """Build the column prefix for the records found at ``path``."""
        if isinstance(path, list):
            return sep.join(str(part) for part in path) + sep
        return f'{path}{sep}'

    def _analyze(self, sample):
        """Recompute ``arg_record`` / ``arg_meta`` from scratch for ``sample``."""
        # Fresh lists on every analysis: appending to the previous result
        # would duplicate paths when run()/get_arg() are called repeatedly.
        self.arg_record = []
        self.arg_meta = []
        json_nor.get_dict_allkeys(
            sample, arg_record=self.arg_record, arg_meta=self.arg_meta
        )

    def run(self, meta_prefix='meta', sep='->', arg_data=None, nor_data=None, errors='raise'):
        """Infer the normalize arguments and convert the data to a DataFrame.

        The first record path is normalized together with all meta paths. Each
        additional record path is normalized on its own and concatenated
        column-wise (``axis=1``) in front of the result so far. When no record
        path is found the data is flattened with a plain ``json_normalize``.

        NOTE(review): for a record path nested several keys deep, pandas
        resolves meta paths relative to the levels of that record path, so
        metadata located outside that path may not be resolvable - confirm
        against the pandas ``json_normalize`` documentation.

        Args:
            meta_prefix (str, optional): Prefix for meta columns. Defaults to 'meta'.
            sep (str, optional): Separator used in column names and prefixes.
                Defaults to '->'.
            arg_data (dict | list, optional): Data used to infer the arguments.
                Defaults to ``self.data``.
            nor_data (dict | list, optional): Data to convert. Defaults to
                ``self.data``.
            errors (str, optional): 'raise' or 'ignore', forwarded to
                ``json_normalize``. Defaults to 'raise'.

        Returns:
            DataFrame: The converted data (also stored in ``self.nor_data``).

        Raises:
            ValueError: If there is no data to convert.
        """
        if arg_data is None:
            arg_data = self.data
        if nor_data is None:
            nor_data = self.data
        if nor_data is None:
            raise ValueError('no data to normalize: pass nor_data or set data')

        self._analyze(arg_data)

        if not self.arg_record:
            self.nor_data = pd.json_normalize(nor_data, sep=sep, errors=errors)
            return self.nor_data

        first_path, *other_paths = self.arg_record
        frame = pd.json_normalize(
            nor_data,
            record_path=first_path,
            meta=self.arg_meta,
            record_prefix=json_nor._record_prefix(first_path, sep),
            meta_prefix=f'{meta_prefix}{sep}',
            sep=sep,
            errors=errors,
        )
        for path in other_paths:
            extra = pd.json_normalize(
                nor_data,
                record_path=path,
                record_prefix=json_nor._record_prefix(path, sep),
                sep=sep,
                errors=errors,
            )
            frame = pd.concat([extra, frame], axis=1)

        self.nor_data = frame
        return self.nor_data

    def get_arg(self):
        """Infer and return the ``record_path`` and ``meta`` arguments.

        For a list of records the first dict is used as the sample; a single
        dict is analysed directly.

        Returns:
            tuple: ``(arg_record, arg_meta)``.
        """
        self._analyze(self.data)

        return self.arg_record, self.arg_meta