Python 读取 MongoDB bson.gz 文件

徐弘图
2023-12-01

需要用到 bson 库(bson 随 pymongo 一起提供,安装 pymongo 即可)。安装命令:

pip install pymongo
import bson
import gzip
import re
import pandas as pd
from pandas import DataFrame

def load_mongo_bson_gz_file_all(path):
    """Load every document of a gzip-compressed MongoDB dump at once.

    The file is decompressed completely and the resulting bytes are handed
    to ``bson.decode_all`` in a single call, so the whole dump is held in
    memory.

    Args:
        path: Path string of the dump file; it must end with ``bson.gz``.

    Returns:
        Whatever ``bson.decode_all`` returns for the decompressed content.

    Raises:
        Exception: If ``path`` does not end with ``bson.gz``.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence,
    # which newer Python versions warn about.
    if not re.search(r'bson\.gz$', path):
        raise Exception('Not mongo bson file!')
    with gzip.open(path, 'rb') as fp:
        return bson.decode_all(fp.read())


def load_mongo_bson_gz_file_iter(path, size=5000):
    """Iterate over a gzip-compressed MongoDB dump in batches.

    Documents are decoded lazily through ``bson.decode_file_iter`` and
    grouped into lists of at most ``size`` items. Every batch is a freshly
    created list, so batches stay valid after the generator advances, and
    no empty batch is produced (an empty dump yields nothing).

    This is a generator: the argument checks run, and their exceptions are
    raised, only when iteration starts.

    Args:
        path: Path string of the dump file; it must end with ``bson.gz``.
        size: Maximum number of documents per batch (default 5000).

    Yields:
        Lists of decoded documents; only the last one may hold fewer than
        ``size`` items.

    Raises:
        Exception: If ``path`` does not end with ``bson.gz``.
        ValueError: If ``size`` is smaller than 1.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence.
    if not re.search(r'bson\.gz$', path):
        raise Exception('Not mongo bson file!')
    # A non-positive size could never fill a batch; reject it up front
    # instead of looping forever on empty batches.
    if size < 1:
        raise ValueError('size must be a positive integer')
    with gzip.open(path, 'rb') as fp:
        batch = []
        for document in bson.decode_file_iter(fp):
            batch.append(document)
            if len(batch) >= size:
                yield batch
                # Start a new list rather than clearing the old one so the
                # batch already handed to the caller is left untouched.
                batch = []
        if batch:
            yield batch

def _read_mongo_bson_gz_file_iter(path, size=5000):
    """Yield one ``DataFrame`` per batch of a ``*.bson.gz`` dump file.

    Args:
        path: Path of the dump file, passed on to
            ``load_mongo_bson_gz_file_iter``.
        size: Batch size passed on to ``load_mongo_bson_gz_file_iter``;
            the default of 5000 equals that function's own default.

    Yields:
        A ``DataFrame`` built from each batch the loader produces.
    """
    for batch in load_mongo_bson_gz_file_iter(path, size=size):
        yield DataFrame(batch)
def _read_mongo_bson_gz_file(path):
    """Read a whole ``*.bson.gz`` dump file into a single ``DataFrame``.

    The documents come from ``load_mongo_bson_gz_file_all``; any exception
    it raises propagates to the caller.
    """
    documents = load_mongo_bson_gz_file_all(path)
    return DataFrame(data=documents)

# Attach the readers to the pandas module object so they can be called as
# ``pd.read_mongo(path)`` and ``pd.read_mongo_iter(path)``.
pd.read_mongo = _read_mongo_bson_gz_file
pd.read_mongo_iter = _read_mongo_bson_gz_file_iter
 类似资料: