GridFS 是 MongoDB 提供的将二进制数据存储在数据库中的解决方案。MongoDB 对 BSON 格式的数据(文档)有尺寸限制,单个文档最大为 16MB。但是在实际系统开发中,上传的图片或者文件可能尺寸会很大,此时我们可以借用 GridFS 来管理这些文件。
GridFS 使用两个集合(collection)存储文件:一个集合是 chunks,用于存储文件内容的二进制数据;另一个集合是 files,用于存储文件的元数据。
GridFS 会将两个集合放在一个普通的 bucket 中,并且这两个集合使用 bucket 的名字作为前缀。MongoDB 的 GridFS 默认使用名为 fs 的 bucket 存放这两个集合,因此存储文件的两个集合分别命名为 fs.files 和 fs.chunks。
当把一个文件存储到 GridFS 时,如果文件大于 chunkSize(每个 chunk 块的默认大小为 255KB),会先将文件按照 chunk 的大小分割成多个 chunk 块,最终将 chunk 块的信息存储在 fs.chunks 集合的多个文档中,然后将文件信息存储在 fs.files 集合的唯一一份文档中。其中 fs.chunks 集合中多个文档的 files_id 字段对应 fs.files 集合中文档的 "_id" 字段。
// fs.files 集合中文档的存储内容:
{
"_id": <ObjectId>, // 文档 ID,唯一标识
"chunkSize": <num>, // chunk 大小,默认 255KB
"uploadDate": <timestamp>, // 文件上传时间
"length": <num>, // 文件长度
"md5": <string>, // 文件 md5 值
"filename": <string>, // 文件名
"contentType": <string>, // 文件的 MIME 类型
"metadata": <dataObject> // 文件自定义信息
}
// fs.chunks 集合中文档的存储内容:
{
"_id": <ObjectId>, // 文档 ID,唯一标识
"files_id": <ObjectId>, // 对应 fs.files 文档的 ID
"n": <num>, // 序号,标识文件的第几个 chunk
"data": <binary> // 文件二进制数据
}
环境:macOS 12.3.1
Mongo : 4.2.18
cd /usr/local/Cellar/mongodb-community@4.2/4.2.18
./bin/mongod --dbpath ./data/db
./bin/mongofiles put /data/111.txt
./bin/mongofiles get /data/111.txt -l 001.txt
./bin/mongofiles delete /data/111.txt
python3 -m pip install pymongo
import os
import gridfs
import pymongo
def get_db() -> pymongo.database.Database:
    """Return a handle to the ``mydb`` database on the local MongoDB server.

    The ``MongoClient`` is created on the first call and cached on the
    function object, so repeated calls (the GridFS helpers in this script
    call this once per file) share a single client instead of constructing
    a new one every time.

    Returns:
        The ``mydb`` database of the server at ``mongodb://localhost:27017/``.
    """
    client = getattr(get_db, "_client", None)
    if client is None:
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        # Stored as a function attribute so no extra module-level state is needed.
        get_db._client = client
    return client["mydb"]
# https://www.osgeo.cn/mongo-python-driver/api/gridfs/index.html
def insertOne():
    """Insert one hard-coded sample document into ``student`` and print its id."""
    student = {"name": "法外", "stuno": "2022211512", "class": "22物理1班"}
    result = get_db()["student"].insert_one(student)
    print(result.inserted_id)
def gridFsInsert():
    """Store a small byte string in GridFS, then print its id and its content."""
    bucket = gridfs.GridFS(get_db())
    file_id = bucket.put(b"hello world", filename='helloworld')
    print(file_id)
    # Read the file straight back to show the round trip.
    stored = bucket.get(file_id)
    print(stored.read())
def gridFsInsert2(filePath: str):
    """Upload the file at ``filePath`` to GridFS unless it is already stored.

    A file counts as already stored when a GridFS file document has a
    ``filepath`` field equal to ``filePath``; in that case a message is
    printed and nothing is uploaded. Otherwise the file is stored with three
    extra fields and the new file id is printed:

    * ``filename`` - base name of ``filePath``
    * ``filepath`` - ``filePath`` exactly as given
    * ``folder``   - name of the directory that directly contains the file
      (``''`` when ``filePath`` has no directory part)

    Args:
        filePath: Path of the local file to upload.
    """
    fs = gridfs.GridFS(get_db())
    if fs.exists({'filepath': filePath}):
        print('已经存在该文件')
        return

    # os.path handles a bare file name, where split(os.sep)[-2] would raise
    # IndexError.
    file_name = os.path.basename(filePath)
    folder = os.path.basename(os.path.dirname(filePath))
    with open(filePath, 'rb') as file_obj:
        # put() accepts a file-like object, so the file does not have to be
        # read into one bytes object first.
        object_id = fs.put(file_obj, filename=file_name, filepath=filePath,
                           folder=folder)
    print(object_id)
def getFileByObjId(obejctId):
    """Download one GridFS file to ``../tmp/get_<base name>``.

    The output directory ``../tmp`` (relative to the current working
    directory) is created when it does not exist yet. When the stored file
    has no filename, the string form of ``obejctId`` is used instead.

    Args:
        obejctId: ``_id`` of the GridFS file to fetch. The misspelled
            parameter name is kept so existing keyword callers still work.
    """
    fs = gridfs.GridFS(get_db())
    grid_out = fs.get(obejctId)

    # filename is optional metadata, so it may be None.
    stored_name = grid_out.filename or str(obejctId)
    os.makedirs("../tmp", exist_ok=True)
    target = "../tmp/get_" + os.path.basename(stored_name)

    with open(target, 'wb') as output:
        # Copy in bounded pieces so a large file is never held in memory whole.
        while True:
            chunk = grid_out.read(1024 * 1024)
            if not chunk:
                break
            output.write(chunk)
def gridFsGetFile(filename: str):
    """Download a GridFS file selected by its ``filename`` field.

    Prints the id of the matching file and hands it to ``getFileByObjId``.
    Note that ``gridFsInsert2`` stores only the base name as ``filename``,
    so pass e.g. ``'111.txt'`` rather than a full path.

    Args:
        filename: Value of the ``filename`` field to look for.

    Raises:
        gridfs.errors.NoFile: If no stored file has that filename.
    """
    fs = gridfs.GridFS(get_db())
    grid_out = fs.find_one({'filename': filename})
    if grid_out is None:
        # find_one() returns None on no match; fail with a clear error instead
        # of an AttributeError on ``None._id``.
        raise gridfs.errors.NoFile(
            "no file in GridFS with filename %r" % filename)
    object_id = grid_out._id
    print(object_id)
    getFileByObjId(object_id)
def gridFsGetFiles(folder: str):
    """Download every GridFS file whose ``folder`` field equals ``folder``.

    Each matching id is printed and then passed to ``getFileByObjId``.
    """
    bucket = gridfs.GridFS(get_db())
    for grid_out in bucket.find({'folder': folder}):
        file_id = grid_out._id
        print(file_id)
        getFileByObjId(file_id)
def listPath(path: str):
    """Recursively upload the files below ``path`` to GridFS.

    Entries whose name starts with ``.`` (files and directories alike) are
    skipped, and so are files with the extension ``.html``, ``.js`` or
    ``.css``. Every remaining file is printed and passed to
    ``gridFsInsert2``.

    Args:
        path: Directory to walk.
    """
    skipped_extensions = ('.html', '.js', '.css')
    for entry in os.listdir(path):
        if entry.startswith('.'):
            continue
        child = os.path.join(path, entry)
        if os.path.isdir(child):
            listPath(child)
            continue
        # splitext() only reports a real extension, so a file literally named
        # "html", "js" or "css" is no longer mistaken for a web asset.
        if os.path.splitext(entry)[1] in skipped_extensions:
            continue
        print(child)
        gridFsInsert2(child)
def main():
    """Entry point of the demo script.

    Prints a start-up banner followed by the platform path separator. The
    commented-out lines are sample calls of the helpers in this script;
    uncomment one to try it. ``gridFsInsert2`` stores the base name as
    ``filename`` and the parent directory name as ``folder``, so lookups for
    a file uploaded from ``/data/111.txt`` would use ``'111.txt'`` and
    ``'data'``.
    """
    print('pymongo is running...')
    # insertOne()
    # gridFsInsert()
    # gridFsInsert2('/data/111.txt')
    # gridFsGetFile('/data/111.txt')
    # gridFsGetFiles('/data/')
    # listPath('/data/')
    separator = os.sep
    print(separator)


if __name__ == '__main__':
    main()
面向对象方式
import os
import gridfs
import pymongo
class MongoClientUtil(object):
    """Small convenience wrapper around a PyMongo client for one server URL."""

    def __init__(self, host_url: str = 'mongodb://localhost:27017/'):
        """Remember the server URL; no client is created yet.

        Args:
            host_url: MongoDB connection string.
        """
        # A double leading underscore makes the attribute name-mangled (private).
        self.__host_url = host_url
        # Created on first use by get_db() and reused afterwards, so repeated
        # calls do not construct a new MongoClient each time.
        self.__client = None

    def get_db(self, db_name: str = 'test') -> pymongo.database.Database:
        """Return the database called ``db_name`` on the configured server."""
        if self.__client is None:
            self.__client = pymongo.MongoClient(self.__host_url)
        return self.__client[db_name]

    def insert_doc(self, doc_name: str, doc: dict, db_name: str = 'test') -> object:
        """Insert ``doc`` into a collection and return the new document id.

        Args:
            doc_name: Name of the collection to insert into.
            doc: Document to insert.
            db_name: Database that holds the collection.

        Returns:
            The ``inserted_id`` reported by ``insert_one``.
        """
        collection = self.get_db(db_name)[doc_name]
        return collection.insert_one(doc).inserted_id

    def find_one(self, doc_name: str, query: dict, db_name: str = 'test') -> object:
        """Return the result of ``find_one(query)`` on a collection.

        Args:
            doc_name: Name of the collection to search.
            query: Filter passed to ``find_one`` (a mapping, e.g.
                ``{'stuno': '2022211565'}``).
            db_name: Database that holds the collection.
        """
        collection = self.get_db(db_name)[doc_name]
        return collection.find_one(query)
class GridFsUtil(MongoClientUtil):
    """GridFS helpers (upload, download, search, delete).

    Every helper except ``getfs`` works on the default database of
    ``getfs()``, i.e. ``'test'``.
    """

    def getfs(self, db_name: str = 'test') -> gridfs.GridFS:
        """Return a ``GridFS`` instance bound to the database ``db_name``."""
        return gridfs.GridFS(self.get_db(db_name))

    def upload_data(self, data: bytes, filename: str = ''):
        """Store ``data`` under ``filename`` and return the new file id."""
        return self.getfs().put(data, filename=filename)

    def upload_file(self, file_path: str):
        """Upload a local file and return the new file id.

        The file is stored with ``filename`` (base name), ``filepath`` (the
        path as given) and ``folder`` (name of the directory directly
        containing the file, ``''`` when the path has no directory part).

        Args:
            file_path: Path of the local file to upload.

        Raises:
            ValueError: If a stored file already has this ``filepath``.
        """
        fs = self.getfs()
        if fs.exists({'filepath': file_path}):
            raise ValueError('已经存在该文件:%s' % file_path)

        # os.path handles a bare file name, where split(os.sep)[-2] would
        # raise IndexError.
        file_name = os.path.basename(file_path)
        folder = os.path.basename(os.path.dirname(file_path))
        with open(file_path, 'rb') as file_obj:
            # put() accepts a file-like object, so the file does not have to
            # be read into one bytes object first.
            return fs.put(file_obj, filename=file_name, filepath=file_path,
                          folder=folder)

    def get_data_by_object_id(self, object_id) -> bytes:
        """Return the full content of the file with id ``object_id``."""
        return self.getfs().get(object_id).read()

    def find_by_query(self, query: dict) -> list:
        """Return the ids of all stored files matching ``query``.

        Args:
            query: Filter on the file documents, e.g.
                ``{'filename': '1111.txt'}``.
        """
        return [grid_out._id for grid_out in self.getfs().find(query)]

    def find_one_file(self, query: dict) -> object:
        """Return the id of one stored file matching ``query``.

        Returns:
            The file id, or ``None`` when nothing matches (previously this
            case failed with an ``AttributeError``).
        """
        grid_out = self.getfs().find_one(query)
        if grid_out is None:
            return None
        return grid_out._id

    def delete(self, obj_id):
        """Delete the stored file with id ``obj_id``."""
        self.getfs().delete(obj_id)
def save_file(data: bytes, file_name: str):
    """Write ``data`` to ``file_name`` in binary mode, replacing any old content."""
    with open(file_name, mode='wb') as sink:
        sink.write(data)
##
# 这就是动态语言的“鸭子类型”:它并不要求严格的继承体系,一个对象只要“看起来像鸭子,走起路来像鸭子”,那它就可以被看做是鸭子。
# Python 的 “file-like object” 就是一种鸭子类型。
##
def main():
    """Entry point of the object-oriented demo.

    Prints a banner, then looks up every stored file whose ``filename`` is
    ``'1111.txt'``, printing and deleting each one. The commented-out lines
    are further sample calls of the utility classes.
    """
    print('service is running...')
    gridfs_util = GridFsUtil()
    # print(isinstance(gridfs_util, MongoClientUtil))
    # obj_id = gridfs_util.insert_doc("student", {"name": "赵天", "stuno": "2022211565", "class": "22物理1班"})
    # r = gridfs_util.find_one('student', {'stuno': '2022211565'})
    # gridfs_util.upload_data(b'hello gridFs ', 'file1')
    # obj_id = gridfs_util.upload_file('D:\\1111.txt')
    # print(obj_id)
    # data = gridfs_util.get_data_by_object_id(obj_id)
    # save_file(data, 'D:\\2222.txt')
    for file_id in gridfs_util.find_by_query({'filename': '1111.txt'}):
        print(file_id)
        gridfs_util.delete(file_id)


if __name__ == '__main__':
    main()