GridFS 笔记

丌官盛

2023-12-01

理论知识

GridFS 是 MongoDB 提供的将二进制数据存储在数据库中的解决方案。MongoDB 的 BSON 格式数据（文档）有尺寸限制，最大为 16MB；但是在实际系统开发中，上传的图片或者文件可能尺寸会很大，此时我们可以借用 GridFS 来管理这些文件。

GridFS 使用两个集合（collection）存储文件。一个集合是 chunks, 用于存储文件内容的二进制数据；一个集合是 files，用于存储文件的元数据。
GridFS 会将两个集合放在一个普通的 bucket 中，并且这两个集合使用 bucket 的名字作为前缀。MongoDB 的 GridFS 默认使用名为 fs 的 bucket 存放这两个集合，因此存储文件的两个集合分别命名为 fs.files 和 fs.chunks。

当把一个文件存储到 GridFS 时，如果文件大于 chunkSize（默认每个 chunk 块大小为 255KB），会先将文件按照 chunk 的大小分割成多个 chunk 块，并将这些 chunk 块存储在 fs.chunks 集合的多个文档中；然后将文件信息存储在 fs.files 集合的唯一一份文档中。其中 fs.chunks 集合中多个文档的 files_id 字段对应 fs.files 集合中文档的 "_id" 字段。

// fs.files 集合中文档的存储内容:
{
    "_id": <ObjectId>, // 文档 ID，唯一标识
    "chunkSize": <num>, // chunk 大小，默认 255KB
    "uploadDate": <timestamp>, // 文件上传时间
    "length": <num>, // 文件长度
    "md5": <string>, // 文件 md5 值
    "filename": <string>, // 文件名
    "contentType": <string>, // 文件的 MIME 类型
    "metadata": <dataObject> // 文件自定义信息
}

//fs.chunks 集合中文档的存储内容:
{
    "_id": <ObjectId>, // 文档 ID，唯一标识
    "files_id": <ObjectId>, // 对应 fs.files 文档的 ID
    "n": <num>, // 序号，标识文件的第几个 chunk
    "data": <binary> // 文件二进制数据
}

本机实践

环境：macOS 12.3.1
Mongo : 4.2.18

启动服务

cd /usr/local/Cellar/mongodb-community@4.2/4.2.18
./bin/mongod --dbpath ./data/db

mongofiles

./bin/mongofiles put /data/111.txt

./bin/mongofiles get /data/111.txt -l 001.txt

./bin/mongofiles delete /data/111.txt

Python语言

python3 -m pip install pymongo

import os

import gridfs
import pymongo


_mongo_client = None  # process-wide client, created lazily by get_db()


def get_db() -> pymongo.database.Database:
    """Return a handle to the ``mydb`` database on the local MongoDB server.

    The underlying ``MongoClient`` is created on the first call and reused
    afterwards, so repeated calls (for example once per uploaded file) do not
    each open a new connection pool.

    Returns:
        The ``mydb`` database of ``mongodb://localhost:27017/``.
    """
    global _mongo_client
    if _mongo_client is None:
        _mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
    return _mongo_client["mydb"]


# https://www.osgeo.cn/mongo-python-driver/api/gridfs/index.html
def insertOne():
    """Insert one sample student document into ``student`` and print its id."""
    student = {"name": "法外", "stuno": "2022211512", "class": "22物理1班"}
    result = get_db()["student"].insert_one(student)
    print(result.inserted_id)


def gridFsInsert():
    """Store a small in-memory payload in GridFS and read it back.

    Prints the id of the new file, then the bytes read back from it.
    """
    bucket = gridfs.GridFS(get_db())
    file_id = bucket.put(b"hello world", filename='helloworld')
    print(file_id)
    stored = bucket.get(file_id)
    print(stored.read())


def gridFsInsert2(filePath: str):
    """Upload the file at ``filePath`` to GridFS unless it was uploaded before.

    The file is stored with three extra fields: ``filename`` (last path
    component), ``filepath`` (the path exactly as given) and ``folder`` (the
    second-to-last path component, or ``''`` when the path has none). If a
    file with the same ``filepath`` already exists, a message is printed and
    nothing is uploaded; otherwise the id of the new file is printed.

    Args:
        filePath: Path of the local file, using ``os.sep`` as the separator.
    """
    fs = gridfs.GridFS(get_db())
    if fs.exists({'filepath': filePath}):
        print('已经存在该文件')
        return

    parts = filePath.split(os.sep)
    # A bare file name has no parent component; fall back to '' instead of
    # indexing out of range.
    folder = parts[-2] if len(parts) > 1 else ''
    with open(filePath, 'rb') as fileObj:
        # put() accepts a file-like object, so the whole file does not have
        # to be loaded into memory first.
        objectId = fs.put(fileObj, filename=parts[-1], filepath=filePath,
                          folder=folder)
    print(objectId)


def getFileByObjId(obejctId):
    """Download one GridFS file to ``../tmp/get_<filename>``.

    The output name uses only the last ``os.sep``-separated component of the
    stored ``filename``. The ``../tmp`` directory is created if missing.

    Args:
        obejctId: ``_id`` of the file to fetch from GridFS.
    """
    fs = gridfs.GridFS(get_db())
    gridfsFile = fs.get(obejctId)
    content = gridfsFile.read()
    # Make sure the target directory exists so open() below cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs("../tmp", exist_ok=True)
    newFileName = "../tmp/get_" + gridfsFile.filename.split(os.sep)[-1]
    with open(newFileName, 'wb') as output:
        output.write(content)


def gridFsGetFile(filename: str):
    """Download a GridFS file whose ``filename`` field equals ``filename``.

    Prints the id of the matching file and saves it via ``getFileByObjId``.
    When no file matches, prints a message and returns without downloading.

    Args:
        filename: Value of the ``filename`` field to look up.
    """
    fs = gridfs.GridFS(get_db())
    gridOut = fs.find_one({'filename': filename})
    if gridOut is None:
        # find_one() yields None when nothing matches; report it instead of
        # failing on the attribute access below.
        print('未找到该文件')
        return
    print(gridOut._id)
    getFileByObjId(gridOut._id)


def gridFsGetFiles(folder: str):
    """Download every GridFS file whose ``folder`` field equals ``folder``.

    Each matching id is printed before the file is saved via
    ``getFileByObjId``.
    """
    bucket = gridfs.GridFS(get_db())
    for gridOut in bucket.find({'folder': folder}):
        fileId = gridOut._id
        print(fileId)
        getFileByObjId(fileId)


def listPath(path: str):
    """Recursively upload the files found under ``path`` to GridFS.

    Entries whose name starts with ``.`` are ignored (hidden directories are
    not descended into). Files whose last dot-separated name segment is
    ``html``, ``js`` or ``css`` are ignored as well. Every remaining file
    path is printed and handed to ``gridFsInsert2``.
    """
    for name in os.listdir(path):
        child = os.path.join(path, name)
        hidden = name.startswith('.')

        if os.path.isdir(child):
            if not hidden:
                listPath(child)
            continue

        if hidden:
            continue

        extension = name.rsplit('.', 1)[-1]
        if extension not in ('html', 'js', 'css'):
            print(child)
            gridFsInsert2(child)


def main():
    """Script entry point: print a banner, then the platform path separator.

    The demo calls below are disabled; uncomment the one you want to try.
    """
    # insertOne()
    # gridFsInsert()
    # gridFsInsert2('/data/111.txt')
    # gridFsGetFile('/data/111.txt')
    # gridFsGetFiles('/data/')
    # listPath('/data/')
    for line in ('pymongo is running...', os.sep):
        print(line)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

面向对象方式

import os

import gridfs
import pymongo


class MongoClientUtil(object):
    """Small convenience wrapper around ``pymongo.MongoClient``.

    The client is created lazily on first use and then reused for every call
    made through this instance, instead of opening a new one per operation.
    """

    def __init__(self, host_url: str = 'mongodb://localhost:27017/'):
        """Remember the connection URL; the client is not created here.

        Args:
            host_url: MongoDB connection string.
        """
        # Double leading underscore: name-mangled, i.e. private to this class.
        self.__host_url = host_url
        self.__client = None

    def get_db(self, db_name: str = 'test') -> pymongo.database.Database:
        """Return the database called ``db_name``.

        Args:
            db_name: Name of the database to open.

        Returns:
            The database handle obtained from the shared client.
        """
        if self.__client is None:
            self.__client = pymongo.MongoClient(self.__host_url)
        return self.__client[db_name]

    def insert_doc(self, doc_name: str, doc: dict, db_name: str = 'test') -> object:
        """Insert ``doc`` into the collection ``doc_name``.

        Args:
            doc_name: Name of the target collection.
            doc: Document to insert.
            db_name: Name of the database holding the collection.

        Returns:
            The ``inserted_id`` reported by ``insert_one``.
        """
        collection = self.get_db(db_name)[doc_name]
        return collection.insert_one(doc).inserted_id

    def find_one(self, doc_name: str, query: dict, db_name: str = 'test') -> object:
        """Return the result of ``find_one(query)`` on collection ``doc_name``.

        Args:
            doc_name: Name of the collection to search.
            query: Filter document passed to ``find_one``.
            db_name: Name of the database holding the collection.

        Returns:
            Whatever ``find_one`` returns for the filter.
        """
        collection = self.get_db(db_name)[doc_name]
        return collection.find_one(query)


class GridFsUtil(MongoClientUtil):
    """GridFS helpers built on top of ``MongoClientUtil``.

    Every method takes an optional ``db_name`` (default ``'test'``) selecting
    the database whose GridFS bucket is used.
    """

    def getfs(self, db_name: str = 'test') -> gridfs.GridFS:
        """Return a ``GridFS`` instance bound to database ``db_name``."""
        return gridfs.GridFS(self.get_db(db_name))

    def upload_data(self, data: bytes, filename: str = '', db_name: str = 'test'):
        """Store ``data`` in GridFS under ``filename``.

        Returns:
            The id returned by ``put``.
        """
        return self.getfs(db_name).put(data, filename=filename)

    def upload_file(self, file_path: str, db_name: str = 'test'):
        """Upload the local file at ``file_path`` to GridFS.

        The file is stored with ``filename`` (last path component),
        ``filepath`` (the path as given) and ``folder`` (second-to-last path
        component, or ``''`` when the path has none).

        Args:
            file_path: Path of the local file, using ``os.sep`` as separator.
            db_name: Name of the database to upload into.

        Returns:
            The id returned by ``put``.

        Raises:
            ValueError: If a file with the same ``filepath`` already exists.
        """
        fs = self.getfs(db_name)
        if fs.exists({'filepath': file_path}):
            raise ValueError('已经存在该文件:%s' % file_path)

        parts = file_path.split(os.sep)
        # A bare file name has no parent component; fall back to '' instead
        # of indexing out of range.
        folder = parts[-2] if len(parts) > 1 else ''
        with open(file_path, 'rb') as file_obj:
            # put() accepts a file-like object, so the whole file does not
            # have to be loaded into memory first.
            return fs.put(file_obj, filename=parts[-1],
                          filepath=file_path,
                          folder=folder)

    def get_data_by_object_id(self, object_id, db_name: str = 'test') -> bytes:
        """Return the full content of the GridFS file with id ``object_id``."""
        return self.getfs(db_name).get(object_id).read()

    def find_by_query(self, query: dict, db_name: str = 'test') -> list:
        """Return the ids of all GridFS files matching ``query``.

        Args:
            query: Filter document passed to ``GridFS.find``.
            db_name: Name of the database to search.

        Returns:
            A list with the ``_id`` of every match (empty if none).
        """
        return [grid_out._id for grid_out in self.getfs(db_name).find(query)]

    def find_one_file(self, query: dict, db_name: str = 'test') -> object:
        """Return the id of one GridFS file matching ``query``.

        Returns:
            The ``_id`` of the match, or ``None`` when nothing matches.
        """
        grid_out = self.getfs(db_name).find_one(query)
        # find_one() yields None when nothing matches; pass that on rather
        # than failing on the attribute access.
        return None if grid_out is None else grid_out._id

    def delete(self, obj_id, db_name: str = 'test'):
        """Delete the GridFS file with id ``obj_id``."""
        self.getfs(db_name).delete(obj_id)


def save_file(data: bytes, file_name: str):
    """Write ``data`` to ``file_name`` in binary mode.

    Any existing content of the file is replaced.
    """
    with open(file_name, mode='wb') as target:
        target.write(data)


##
# 这就是动态语言的“鸭子类型”：它并不要求严格的继承体系，一个对象只要“看起来像鸭子，走起路来像鸭子”，那它就可以被看做是鸭子。
# Python 的“file-like object”就是一种鸭子类型。
##
def main():
    """Demo entry point: delete every GridFS file named ``1111.txt``.

    Prints a start-up banner, then the id of each matching file just before
    it is removed.
    """
    print('service is running...')
    util = GridFsUtil()

    # Other usage examples (disabled):
    # print(isinstance(util, MongoClientUtil))
    # obj_id = util.insert_doc("student", {"name": "赵天", "stuno": "2022211565", "class": "22物理1班"})
    # r = util.find_one('student', {'stuno': '2022211565'})
    # util.upload_data(b'hello gridFs ', 'file1')
    #
    # obj_id = util.upload_file('D:\\1111.txt')
    # print(obj_id)
    # data = util.get_data_by_object_id(obj_id)
    # save_file(data, 'D:\\2222.txt')

    for oid in util.find_by_query({'filename': '1111.txt'}):
        print(oid)
        util.delete(oid)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

资料

https://www.mongodb.com/docs/manual/core/gridfs/
https://mongodb.github.io/mongo-java-driver/3.2/driver/reference/gridfs/
https://mongoing.com/archives/77493

GridFS 笔记

理论知识

本机实践

Python语言

资料

相关阅读

相关文章

相关问答

相关文档