需要用到 bson 库(bson 随 pymongo 一起提供,安装 pymongo 即可,无需单独安装 bson)。安装命令:
pip install pymongo
import bson
import gzip
import re
import pandas as pd
from pandas import DataFrame
def load_mongo_bson_gz_file_all(path):
    """Load every document from a gzip-compressed mongo dump (``*.bson.gz``).

    The file is decompressed and decoded in one go, so the whole dump is
    held in memory.

    Args:
        path: Path of the dump file; it must end in ``bson.gz``.

    Returns:
        The result of ``bson.decode_all`` on the decompressed bytes.

    Raises:
        Exception: If ``path`` does not end in ``bson.gz``.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid escape sequence in the string literal.
    if not re.search(r'bson\.gz$', path):
        raise Exception('Not mongo bson file!')
    with gzip.open(path, 'rb') as fp:
        return bson.decode_all(fp.read())
def load_mongo_bson_gz_file_iter(path, size=5000):
    """Yield documents from a gzip-compressed mongo dump in batches.

    This is a generator, so the argument checks below run (and raise) on the
    first iteration step rather than when the function is called.

    Args:
        path: Path of the dump file; it must end in ``bson.gz``.
        size: Maximum number of documents per batch (default 5000).

    Yields:
        Lists of at most ``size`` decoded documents. Every batch is a new
        list, so batches a caller keeps are not altered by later iteration.
        The last batch holds whatever remains and is empty when the number
        of documents is a multiple of ``size`` (including an empty file).

    Raises:
        Exception: If ``path`` does not end in ``bson.gz``.
        ValueError: If ``size`` is less than 1.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid escape sequence in the string literal.
    if not re.search(r'bson\.gz$', path):
        raise Exception('Not mongo bson file!')
    if size < 1:
        # A non-positive size could never complete a batch.
        raise ValueError('size must be a positive integer')

    with gzip.open(path, 'rb') as fp:
        batch = []
        for document in bson.decode_file_iter(fp):
            batch.append(document)
            if len(batch) == size:
                yield batch
                # Start a fresh list instead of clearing the one just handed
                # out; clearing would empty the batch the caller received.
                batch = []
        # Always emit the remainder, even when empty, so consumers see the
        # same sequence of batches as before.
        yield batch
def _read_mongo_bson_gz_file_iter(path, size=5000):
    """Yield one DataFrame per batch of documents read from ``path``.

    Args:
        path: Path of the ``*.bson.gz`` dump, forwarded to
            ``load_mongo_bson_gz_file_iter``.
        size: Maximum number of documents per batch (default 5000),
            forwarded to ``load_mongo_bson_gz_file_iter``.

    Yields:
        A ``DataFrame`` built from each batch the loader produces.
    """
    for batch in load_mongo_bson_gz_file_iter(path, size):
        yield DataFrame(batch)
def _read_mongo_bson_gz_file(path):
    """Read a whole ``*.bson.gz`` dump into a single DataFrame.

    Args:
        path: Path of the dump, forwarded to ``load_mongo_bson_gz_file_all``.

    Returns:
        A ``DataFrame`` built from all documents the loader returns.
    """
    documents = load_mongo_bson_gz_file_all(path)
    return DataFrame(data=documents)
# Attach the readers to the imported pandas module so they can be called as
# ``pd.read_mongo(path)`` and ``pd.read_mongo_iter(path)``.
pd.read_mongo = _read_mongo_bson_gz_file
pd.read_mongo_iter = _read_mongo_bson_gz_file_iter