Mongoengine基础教程

宁欣怿

2023-12-01

`Mongoengine`基础教程

安装

方法一：pip安装
- python -m pip install mongoengine
方法二：github源码安装
- git clone https://github.com/mongoengine/mongoengine
- cd mongoengine
- python setup.py install

连接

本地连接

from mongoengine import connect

# Connect to the database named 'db' using the default host and port.
connect('db')

URL连接

# Without a username and password
connect(host='mongodb://hostname:port/db')
# With a username and password
connect(host='mongodb://user:password@hostname:port/db')

指定参数连接

# `port` is a placeholder for the server's port number.
connect('db', host='host', port=port, authentication_source='admin/db')

多个数据库连接时，需要指定别名

# Register one connection per database. Documents select a connection through
# the alias given in their `meta['db_alias']`.
connect(alias='user-db-alias', db='user-db')
connect(alias='book-db-alias', db='book-db')
connect(alias='users-books-db-alias', db='user-books-db')


class User(Document):
    name = StringField()

    meta = {'db_alias': 'user-db-alias'}


class Book(Document):
    name = StringField()

    meta = {'db_alias': 'book-db-alias'}


class AuthorBooks(Document):
    author = ReferenceField(User)
    book = ReferenceField(Book)

    # Must match an alias registered with connect() above.
    meta = {'db_alias': 'users-books-db-alias'}

断开连接

disconnect(alias='db')  # With no argument, the default connection is closed

数据库切换

from mongoengine.context_managers import switch_db
from mongoengine import *


class User(Document):
    name = StringField()
    meta = {'db_alias': 'user-db'}


# switch_db temporarily rebinds the document class to another registered
# database alias for the duration of the `with` block.
with switch_db(User, 'archive-user-db') as User:
    User(name='Ross').save()  # Saved to the User collection in 'archive-user-db'

集合切换

from mongoengine.context_managers import switch_collection
from mongoengine import *

# Minimal document used to demonstrate switching the target collection.
class Group(Document):
    name = StringField()
    
Group(name='test').save()  # Saved in the Group collection
# Inside the `with` block the Group class writes to the 'group2000' collection.
with switch_collection(Group, 'group2000') as Group:
    Group(name='hello Group').save()  # Saved in group2000

文档定义

常规文档定义：文档根据其字段顺序进行序列化

from mongoengine import * 
import datetime

class Page(Document):
    """Regular document with a required title and a modification timestamp."""

    title = StringField(max_length=200, required=True)
    # Pass the callable itself (no parentheses) so the default is computed for
    # each new document instead of once when the class is defined.
    date_modified = DateTimeField(default=datetime.datetime.utcnow)

动态文档定义：可以随意添加新的字段

from mongoengine import * 

# Dynamic document: only `title` is declared; per the section text, additional
# fields may be added freely.
class Page(DynamicDocument):
    title = StringField(max_length=200, required=True)

常规字段

BinaryField
BooleanField
DateTimeField
DictField
FloatField
IntField
ListField
EmbeddedDocumentField

ReferenceField

class User(Document):
    name = StringField()


class Page(Document):
    content = StringField()
    author = ReferenceField(User)


class Employee(Document):
    name = StringField()
    boss = ReferenceField('self')  # Self-reference
    profile_page = ReferenceField('ProfilePage')  # Class not yet defined, referenced by name


class ProfilePage(Document):
    content = StringField()

GenericReferenceField

class Link(Document):
    url = StringField()


class Post(Document):
    title = StringField()


class Bookmark(Document):
    # A generic reference may point at any kind of Document.
    bookmark_object = GenericReferenceField()


link = Link(url='http://hmarr.com')
link.save()

post = Post(title='Using MongoEngine')
post.save()

Bookmark(bookmark_object=link).save()
Bookmark(bookmark_object=post).save()

字段参数
- db_field(default: None)：MongoDB中字段名称
- required(default: False)：如果数据中没有这个字段，会报ValidationError
- default(default: None)：如果没有赋值，则使用默认值
- unique(default: False)：唯一值
- unique_with(default: None): 联合唯一
- primary_key(default: False): 主键
- choices(default: None): 限制字段的值范围
- validation(Optional):对该字段进行验证
  - ```
  # Validator callable: receives the field value and raises ValidationError
  # when the value is falsy.
  def _not_empty(val):
      if not val:
          raise ValidationError('value can not be empty')
          
  # The validator is attached to the field through the `validation` argument.
  class Person(Document):
      name = StringField(validation=_not_empty)
```
- **kwargs

元类

class Page(Document):
    """Example document showing the options accepted by the `meta` dict."""

    category = IntField()
    rating = StringField(unique=True)
    title = StringField(max_length=200, required=True)
    created = DateTimeField()

    meta = {
        'allow_inheritance': True,
        'ordering': ['-created'],  # ordering
        'collection': 'cmsPage',   # collection name
        'max_documents': 1000,     # max documents
        'max_size': 200000,        # max size
        'indexes': [
            'title',               # single-field index
            '$title',              # text index
            '#title',              # hashed index
            ('title', '-rating'),  # compound index
            ('category', '_cls'),  # compound index
            {
                'fields': ['created'],
                'expireAfterSeconds': 3600,  # ttl index
            },
        ],
    }

查询

过滤查询

# Return a QuerySet that will only iterate over users whose 'country' field is
# set to 'uk'
uk_users = User.objects(country='uk')
# Filter on a field of an embedded document using the double-underscore syntax
uk_pages = Page.objects(author__country='uk')

查询操作符
- ```
# Only find users whose age is 18 or less
young_users = User.objects(age__lte=18)
```
- ne - not equal to
- lt - less than
- lte - less than or equal to
- gt - greater than
- gte - greater than or equal to
- not - negate a standard check, may be used before other operators (e.g. Q(age__not__mod=(5, 0)))
- in - value is in list (a list of values should be provided)
- nin - value is not in list (a list of values should be provided)
- mod - value % x == y, where x and y are two provided values
- all - every item in list of values provided is in array
- size - the size of the array is
- exists - value for field exists
- 字符串查询
  - exact - string field exactly matches value
  - iexact - string field exactly matches value (case insensitive)
  - contains - string field contains value
  - icontains - string field contains value (case insensitive)
  - startswith - string field starts with value
  - istartswith - string field starts with value (case insensitive)
  - endswith - string field ends with value
  - iendswith - string field ends with value (case insensitive)
  - wholeword - string field contains whole word
  - iwholeword - string field contains whole word (case insensitive)
  - regex - string field match by regex
  - iregex - string field match by regex (case insensitive)
  - match - performs an $elemMatch so you can match an entire document within an array
- 列表查询
  - ```
  class Page(Document):
      # ListField takes a field *instance* describing its items.
      tags = ListField(StringField())

  # This will match all pages that have the word 'coding' as an item in the
  # 'tags' list
  Page.objects(tags='coding')
  # query by position
  Page.objects(tags__0='db')
  # fetch part of a list, skip 5, limit 10
  # (assumes the document also has a 'comments' list field)
  Page.objects.fields(slice__comments=[5, 10])
```
- 原始查询
  - ```
  Page.objects(__raw__={'tags': 'coding'})
```

排序

# ASC date
blogs = BlogPost.objects().order_by('date')
# ASC date, DESC title
blogs = BlogPost.objects().order_by('+date', '-title')

限制和跳过结果
- 方法一: 列表切片（建议使用）
  - ```
  users = User.objects[10:15]
```
- 方法二: skip + limit
  - ```
  users = User.objects.skip(10).limit(5)
```
- 有且仅有一个结果时，建议使用.first()获取结果

默认文档查询

修改objects方法返回结果

class BlogPost(Document):
    title = StringField()
    date = DateTimeField()
    
    # Overrides the default `objects` manager so that results are ordered by
    # descending date.
    @queryset_manager
    def objects(doc_cls, queryset):
        return queryset.order_by('-date')

自定义管理器方法

class BlogPost(Document):
    title = StringField()
    published = BooleanField()
    
    # Custom manager that narrows the queryset to published posts only.
    @queryset_manager
    def live_posts(doc_cls, queryset):
        return queryset.filter(published=True)
    
BlogPost(title='test1', published=False).save()
BlogPost(title='test2', published=True).save()
assert len(BlogPost.objects) == 2  # True
assert len(BlogPost.live_posts()) == 1  # True

自定义查询集

# Custom QuerySet subclass adding a reusable filter method.
class AwesomerQuerySet(QuerySet):
    def get_awesome(self):
        return self.filter(awesome=True)
    
# `queryset_class` in meta registers the custom QuerySet for this document.
class Page(Document):
    meta = {'queryset_class': AwesomerQuerySet}
    
Page.objects.get_awesome()

聚合

.count():计数结果
- ```
num_users = User.objects.count()
```
.sum():字段值求和，如果不存在，则忽略
- ```
yearly_expense = Employee.objects.sum('salary')
```
.average():字段平均值
- ```
mean_age = User.objects.average('age')
```

.item_frequencies():字段频率

class Article(Document):
    tag = ListField(StringField())
    
# Frequencies of the items stored in the 'tag' list field, requested in
# normalized form.
tag_freqs = Article.objects.item_frequencies('tag', normalize=True)

from operator import itemgetter
# Ten most frequent tags, highest frequency first.
top_tags = sorted(tag_freqs.items(), key=itemgetter(1), reverse=True)[:10]

MongoDB aggregation

class Person(Document):
    name = StringField()


Person(name='John').save()
Person(name='Bob').save()

pipeline = [
    {'$sort': {'name': -1}},
    {'$project': {'_id': 0, 'name': {'$toUpper': '$name'}}},
]
data = Person.objects().aggregate(pipeline)
# Materialize the result before comparing. The descending sort on 'name'
# puts 'John' before 'Bob'.
assert list(data) == [{'name': 'JOHN'}, {'name': 'BOB'}]

查询效率和性能优化

查询字段的子集

class Film(Document):
    title = StringField()
    year = IntField()
    rating = IntField(default=3)


Film(title='The Shawshank Redemption', year=1994, rating=5).save()
# only('title') loads just the title; other fields fall back to None or to
# their declared default.
f = Film.objects.only('title').first()
f.title  # 'The Shawshank Redemption'
f.year   # None
f.rating  # 3, default value

exclude()和only()作用相反；
后期如果需要之前未加载（缺失）的字段，直接调用reload()方法

获取关联查询
- 使用select_related(max_depth=1)

关闭自动解引用（dereference）

post = Post.objects.no_dereference().first()
assert(isinstance(post.author, DBRef))  # True

高级查询

from datetime import datetime

from mongoengine.queryset.visitor import Q

# Get published posts
Post.objects(Q(published=True) | Q(published_date__lte=datetime.now()))

# Get top posts: (featured AND hits >= 1000) OR hits >= 5000
Post.objects((Q(featured=True) & Q(hits__gte=1000)) | Q(hits__gte=5000))

文档验证

内置验证：调用.validate()或者.save()方法时验证

from mongoengine import Document, EmailField, IntField


class User(Document):
    email = EmailField()
    age = IntField(min_value=0, max_value=99)


user = User(email='invalid@', age=24)
user.validate()  # raises ValidationError (Invalid email address: ['email'])
user.save()  # raises ValidationError (Invalid email address: ['email'])

user2 = User(email='john..doe@garbage.com', age=1000)
user2.save()  # raises ValidationError (Integer value is too large: ['age'])

自定义验证

def not_john_doe(name):
    """Field validator that rejects the exact name 'John Doe'."""
    if name == 'John Doe':
        raise ValidationError('John Doe is not a valid name')
        
class Person(Document):
    full_name = StringField(validation=not_john_doe)
    
Person(full_name='Billy Doe').save()
Person(full_name='John Doe').save()  # raises ValidationError (John Doe is not ...)

clean方法：提供自定义模型验证和/或在验证之前修改某些字段值.当验证开启，调用save()方法时调用

class Essay(Document):
    """Essay whose `pub_date` must be consistent with its `status`."""

    status = StringField(choices=('Published', 'Draft'), required=True)
    pub_date = DateTimeField()

    def clean(self):
        """Reject drafts that carry a pub_date; fill pub_date for published essays."""
        # Validate that only published essays have a 'pub_date'
        if self.status == 'Draft' and self.pub_date is not None:
            raise ValidationError('Draft entries should not ...')
        # Set the pub_date for published items if not set
        if self.status == 'Published' and self.pub_date is None:
            self.pub_date = datetime.now()

自定义字段

class AgeField(IntField):
    """Integer field that additionally refuses the value 60."""

    def validate(self, value):
        # Run the inherited IntField checks (min/max bounds etc.) first.
        super().validate(value)
        if value != 60:
            return
        self.error('60 is not allowed')


class Person(Document):
    age = AgeField(min_value=0, max_value=99)


Person(age=20).save()  # passes
Person(age=1000).save()
# raises ValidationError (Integer value is too large: ['age'])
Person(age=60).save()
# raises ValidationError (Person:None) (60 is not allowed: ['age'])

跳过验证

class Person(Document):
    age = IntField(max_value=100)
    
# validate=False skips validation, so the out-of-range age is not rejected here.
Person(age=1000).save(validate=False)

`GridFS`

GridFS用于存储和恢复那些超过16M(BSON文件限制)的文件（如：图片，音频，视频等）。GridFS同时也是文件存储的一种方式，但它是存储在MongoDB的集合中。
GridFS会将大文件对象分割成多个小的chunk（文件片段），默认为255KB/个，每个chunk将作为MongoDB的一个文档被存储在chunks集合中。
GridFS用两个集合来存储一个文件：fs.files和fs.chunks。文件实际内容存储在.chunks中，和文件有关的meta数据将会被存在.files集合中。

写入

class Animal(Document):
    genus = StringField()
    family = StringField()
    photo = FileField()  # file content is stored through GridFS

marmot = Animal(genus='Marmota', family='Sciuridae')
# Open in binary mode and hand the file object to the FileField together with
# its content type.
with open('marmot.jpg', 'rb') as fd:
    marmot.photo.put(fd, content_type='image/jpeg')
marmot.save()

查询

marmot = Animal.objects(genus='Marmota').first()
photo = marmot.photo.read()
content_type = marmot.photo.content_type

# If you need to read the content of a file multiple times, you'll need to
# 'rewind' the file-like object
marmot = Animal.objects(genus='Marmota').first()
content1 = marmot.photo.read()  # read() returns bytes
assert content1 != b''

content2 = marmot.photo.read()  # will be empty
assert content2 == b''

marmot.photo.seek(0)  # rewind the file by setting the current position
content3 = marmot.photo.read()
assert content3 == content1

流式操作

marmot.photo.new_file()
# File content is binary, so write bytes rather than str.
marmot.photo.write(b'some_image_data')
marmot.photo.write(b'some_more_image_data')
marmot.photo.close()

# Persist the document so it stores the reference to the new file.
marmot.save()

删除

marmot.photo.delete()  # Deletes the GridFS document
# Saves the GridFS reference (being None) contained in the marmot instance
marmot.save()

文件替换

# Open the replacement in a `with` block so the file handle is always closed.
with open('another_marmot.png', 'rb') as another_marmot:
    # Replaces the GridFS document
    marmot.photo.replace(another_marmot, content_type='image/png')
# Replaces the GridFS reference contained in marmot instance
marmot.save()

`Signal`

可用signals主要包括:
- pre_init
  - 在创建新 Document 或EmbeddedDocument 实例期间调用，在收集构造函数参数之后但在对它们进行任何其他处理之前（即默认值的分配）。此信号的处理程序使用 values 关键字参数传递参数字典，并且可以在返回之前修改此字典。
- post_init
  - 在新Document或EmbeddedDocument实例的所有处理完成后调用。
- pre_save
  - 在执行任何操作之前在save()中调用。
- pre_save_post_validation
  - 在验证发生后但在保存之前在 save() 中调用。
- post_save
  - 在大多数操作（验证、插入/更新和级联，但不包括清除脏标志）成功完成后在 save() 中调用。会额外传入布尔关键字参数 created，用于指示本次保存是插入还是更新
- pre_delete
  - 在尝试删除操作之前在 delete() 中调用
- post_delete
  - 成功删除记录后在 delete() 内调用
- pre_bulk_insert
  - 在验证要插入的文档之后、写入任何数据之前调用。在这种情况下，document 参数被替换为 documents 参数，表示正在插入的文档列表。
- post_bulk_insert
  - 在批量插入操作成功后调用。与 pre_bulk_insert 一样，document 参数被省略并替换为 documents 参数。额外的布尔参数 loaded 为 True 时表示 documents 的内容是 Document 实例，为 False 时表示仅是所插入记录的主键值列表。

绑定文档

方法一：

package:1
import logging
from datetime import datetime

from mongoengine import * 
from mongoengine import signals

def update_modified(sender, document):
    document.modified = datetime.utcnow()
    
package:2
class Record(Document):
    modified = DateTimeField()
    
signals.pre_save.connect(update_modified)

方法二:EmbeddedDocument仅支持pre/post_init信号.

class Author(Document):
    """Document whose signal handlers are defined as classmethods."""

    name = StringField()

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        logging.debug('Pre Save: %s' % document.name)

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        logging.debug('Post Save: %s' % document.name)
        # post_save passes a boolean `created`; branch on its value, not merely
        # on its presence, otherwise 'Updated' would never be logged.
        if 'created' in kwargs:
            if kwargs['created']:
                logging.debug('Created')
            else:
                logging.debug('Updated')


signals.pre_save.connect(Author.pre_save, sender=Author)
signals.post_save.connect(Author.post_save, sender=Author)

方法三：

def handler(event):
    """Return a decorator that equips a signal handler with an ``apply`` hook.

    The decorated function is returned unchanged except for a new ``apply``
    attribute. ``apply`` is itself a class decorator: it connects the function
    to ``event`` with the decorated class as sender and returns that class.
    """
    def wrap(func):
        def bind_to(cls):
            # Subscribe the handler for signals sent by this class only.
            event.connect(func, sender=cls)
            return cls

        func.apply = bind_to
        return func

    return wrap

# Declare update_modified as a pre_save handler; the decorator attaches an
# `apply` attribute to it.
@handler(signals.pre_save)
def update_modified(sender, document):
    document.modified = datetime.utcnow()
    
# Applying `update_modified.apply` connects the handler with Record as sender.
@update_modified.apply
class Record(Document):
    modified = DateTimeField()

Text Search

定义全文检索: 使用 $ 前缀设置文本索引。

class News(Document):
    title = StringField()
    content = StringField()
    is_active = BooleanField()
    
    # The '$' prefix on a field name marks it as part of the text index.
    # NOTE(review): 'weights' presumably sets each field's relative relevance
    # in the text score — confirm against the MongoDB text index docs.
    meta = {'indexes': [
        {'fields': ['$title', '$content'],
         'default_language': 'english',
         'weights': {'title': 10, 'content': 20}
        }
    ]}

全文检索使用

News(title='Using mongodb text search', content='Testing text search').save()
News(title='MongoEngine 0.9 released', content='Various improvements').save()

document = News.objects.search_text('testing').first()
document.title  # may be: 'Using mongodb text search'

document = News.objects.search_text('released').first()
document.title  # may be: 'MongoEngine 0.9 released'

使用文本权重排序

obj = News.objects.search_text('mongo').order_by('$text_score')

参考文献

mongoengine官方文档

Mongoengine基础教程