参考 https://developer.aliyun.com/article/643754#comment
分析 RocksDB 何时以及如何将内存数据(MemTable)Flush 到 SST。在 RocksDB 中,每一个 column family 都有自己的 MemTable,当 MemTable 超过固定大小时,会被设置为 immutable,然后会有后台线程启动,把这个 immutable memtable 刷新到磁盘。
–write_buffer_size 表示每个column family的memtable的大小限制
–db_write_buffer_size 总的memtable的大小限制(所有的ColumnFamily).
–max_write_buffer_number 最大的memtable的个数
–min_write_buffer_number_to_merge 表示在 flush 到磁盘之前,至少需要合并的 memtable 的个数
触发 Flush 的条件
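在下面这几种条件下 RocksDB 会 flush memtable 到磁盘:
1. 某个 memtable 的大小超过 write_buffer_size;
2. 所有 column family 的 memtable 总大小超过 db_write_buffer_size;
3. WAL 文件的总大小超过 max_total_wal_size。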
源码分析
在 DBImpl 中包含一个 flush_queue_ 队列,保存所有将要被 flush 到磁盘的 column family 对应的请求。只有当 column family 满足 flush 的条件(cfd->imm()->IsFlushPending())时,才会将其加入到 flush 队列。
// Excerpt of DBImpl ("..." marks elided members) showing its two work queues.
class DBImpl : public DB {
...
// Pending flush requests; SchedulePendingFlush() appends to this queue.
std::deque<FlushRequest> flush_queue_;
// Queue of column families; presumably those awaiting compaction (inferred
// from the name only -- it is not used in the excerpts shown here).
std::deque<ColumnFamilyData*> compaction_queue_;
...
}
// Excerpt ("..." marks elided code): records a flush request so that it can be
// picked up later. `cfd` is obtained in the elided part above.
void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
FlushReason flush_reason) {
...
// Enqueue only when this column family is not already queued for flush and
// cfd->imm()->IsFlushPending() reports that a flush is pending.
if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
cfd->Ref();
cfd->set_queued_for_flush(true);
cfd->SetFlushReason(flush_reason);
// Count the request as not yet scheduled, then append it to flush_queue_.
++unscheduled_flushes_;
flush_queue_.push_back(flush_req);
}
...
}
而刷新 MemTable 到磁盘是由后台线程来做的,这个后台线程的入口函数叫做 BGWorkFlush
void DBImpl::BGWorkFlush(void* arg) {
FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
delete reinterpret_cast<FlushThreadArg*>(arg);
IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
TEST_SYNC_POINT("DBImpl::BGWorkFlush");
static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(fta.thread_pri_);
TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
}
// Excerpt ("..." marks elided code): called from BGWorkFlush with the thread
// priority; delegates the actual work to BackgroundFlush().
void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
...
// made_progress, job_context, log_buffer and reason are declared in the
// elided code above and are passed to BackgroundFlush by address.
Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
&reason, thread_pri);
...
}
最终会调用 BackgroundFlush 函数,而这个函数的主要功能是遍历 flush_req 中的 column family,挑出其中满足 flush 条件的,然后刷新它们的 memtable 到磁盘。
// Excerpt ("..." marks elided code): selects the column families of a flush
// request that can be flushed and passes them to FlushMemTablesToOutputFiles().
// flush_req, column_families_not_to_flush, superversion_contexts, bg_flush_args
// and status are declared in the elided code.
// The flush reason of the first selected column family is written to *reason.
Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
LogBuffer* log_buffer, FlushReason* reason,
Env::Priority thread_pri) {
...
// Each entry pairs a column family (first) with a second value that is
// forwarded unchanged into the flush argument below.
for (const auto& iter : flush_req) {
ColumnFamilyData* cfd = iter.first;
if (immutable_db_options_.experimental_mempurge_threshold > 0.0) {
// If imm() contains silent memtables,
// requesting a flush will mark the imm_needed as true.
cfd->imm()->FlushRequested();
}
if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
// can't flush this CF, try next one
column_families_not_to_flush.push_back(cfd);
continue;
}
// Keep the SuperVersionContext in its own container so that the flush
// argument can refer to it by address.
superversion_contexts.emplace_back(SuperVersionContext(true));
bg_flush_args.emplace_back(cfd, iter.second,
&(superversion_contexts.back()));
}
if (!bg_flush_args.empty()) {
...
status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
job_context, log_buffer, thread_pri);
// NOTE(review): in this excerpt the sync point named "BeforeFlush" appears
// after the flush call -- confirm the ordering against the upstream source.
TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
// All the CFDs in the FlushReq must have the same flush reason, so just
// grab the first one
*reason = bg_flush_args[0].cfd_->GetFlushReason();
...
}
}
最终会调用 FlushMemTableToOutputFile 来刷新 MemTable 到磁盘。在 RocksDB 中刷新是通过 FlushJob 这个类来实现的,最终会调用 WriteLevel0Table来刷新内容到磁盘。
// Excerpt ("..." marks elided code): takes the flush arguments prepared by
// BackgroundFlush() and calls FlushMemTableToOutputFile(). cfd,
// mutable_cf_options, superversion_context, snapshot_seqs,
// earliest_write_conflict_snapshot and snapshot_checker are set up in the
// elided code.
Status DBImpl::FlushMemTablesToOutputFiles(
const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
...
Status s = FlushMemTableToOutputFile(
cfd, mutable_cf_options, made_progress, job_context, superversion_context,
snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
log_buffer, thread_pri);
...
}
// Excerpt ("..." marks elided code): constructs a FlushJob for one column
// family and runs it. max_memtable_id, s, file_meta, switched_to_mempurge and
// need_cancel are declared in the elided code.
Status DBImpl::FlushMemTableToOutputFile(
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
bool* made_progress, JobContext* job_context,
SuperVersionContext* superversion_context,
std::vector<SequenceNumber>& snapshot_seqs,
SequenceNumber earliest_write_conflict_snapshot,
SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
Env::Priority thread_pri) {
...
// The job is given the DB mutex (&mutex_), the shutdown flag, the DB directory
// and the column family's first data directory (GetDataDir(cfd, 0U)).
FlushJob flush_job(
dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
&event_logger_, mutable_cf_options.report_bg_io_stats,
true /* sync_output_directory */, true /* write_manifest */, thread_pri,
io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(),
&blob_callback_);
...
// Run the job only if everything so far succeeded; once Run() has been
// called, need_cancel is cleared.
if (s.ok()) {
s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
&switched_to_mempurge);
need_cancel = false;
}
...
}
FlushJob::Run 的实现如下。SSTable 的具体格式可以查看
https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format?spm=a2c6h.12873639.0.0.3d09e5b9PSdUx7
// Excerpt ("..." marks elided code): runs the flush job. mempurge_s is
// computed in the elided code above.
Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
bool* switched_to_mempurge) {
...
Status s;
// When mempurge_s is OK, no level-0 table is written: base_ is unreferenced
// and the job reports success. Otherwise fall back to WriteLevel0Table().
if (mempurge_s.ok()) {
base_->Unref();
s = Status::OK();
} else {
// This will release and re-acquire the mutex.
s = WriteLevel0Table();
}
...
}
参考 https://www.jianshu.com/p/38a38134491b
FlushJob::WriteLevel0Table 将 FlushJob 中挑选出来的所有 MemTable 进行 Merge,然后构造 SSTable 并写到 L0;其中调用 BuildTable 函数构造 SSTable。
// Excerpt ("..." marks elided code): merges all memtables selected for this job
// (mems_) into one iterator, builds a single table file from it via
// BuildTable(), and on success records the file in edit_ at level 0.
// db_mutex_ is unlocked while the table is built and locked again before the
// version edit and statistics are updated. s, start_micros, start_cpu_micros
// and blob_file_additions are declared in the elided code.
// Returns the status of the build (or of the output-directory fsync).
Status FlushJob::WriteLevel0Table() {
...
{
auto write_hint = cfd_->CalculateSSTWriteHint(0); // compute the write lifetime hint; the output goes to level 0, so pass level = 0
db_mutex_->Unlock();
if (log_buffer_) {
log_buffer_->FlushBufferToLog();
}
// memtables and range_del_iters store internal iterators over each data
// memtable and its associated range deletion memtable, respectively, at
// corresponding indexes.
std::vector<InternalIterator*> memtables;
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
uint64_t total_num_entries = 0, total_num_deletes = 0;
uint64_t total_data_size = 0;
size_t total_memory_usage = 0;
// Walk every MemTable and obtain an iterator for each of them
for (MemTable* m : mems_) {
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
memtables.push_back(m->NewIterator(ro, &arena));
// Build the MemTable's range tombstone iterator
auto* range_del_iter =
m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
total_num_entries += m->num_entries();
total_num_deletes += m->num_deletes();
total_data_size += m->get_data_size();
total_memory_usage += m->ApproximateMemoryUsage();
}
event_logger_->Log() << "job" << job_context_->job_id << "event"
<< "flush_started"
<< "num_memtables" << mems_.size() << "num_entries"
<< total_num_entries << "num_deletes"
<< total_num_deletes << "total_data_size"
<< total_data_size << "memory_usage"
<< total_memory_usage << "flush_reason"
<< GetFlushReasonString(cfd_->GetFlushReason());
{
// One merging iterator over all memtable iterators, ordered by the column
// family's internal comparator.
ScopedArenaIterator iter(
NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
static_cast<int>(memtables.size()), &arena));
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
cfd_->GetName().c_str(), job_context_->job_id,
meta_.fd.GetNumber());
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
&output_compression_);
int64_t _current_time = 0;
auto status = clock_->GetCurrentTime(&_current_time);
// Safe to proceed even if GetCurrentTime fails. So, log and proceed.
if (!status.ok()) {
ROCKS_LOG_WARN(
db_options_.info_log,
"Failed to get current time to populate creation_time property. "
"Status: %s",
status.ToString().c_str());
}
const uint64_t current_time = static_cast<uint64_t>(_current_time);
uint64_t oldest_key_time =
mems_.front()->ApproximateOldestKeyTime();
// It's not clear whether oldest_key_time is always available. In case
// it is not available, use current_time.
uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
TEST_SYNC_POINT_CALLBACK(
"FlushJob::WriteLevel0Table:oldest_ancester_time",
&oldest_ancester_time);
meta_.oldest_ancester_time = oldest_ancester_time;
meta_.file_creation_time = current_time;
// With FIFO compaction the creation time is the current time; otherwise it
// is the oldest ancestor time computed above.
uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
CompactionStyle::kCompactionStyleFIFO)
? current_time
: meta_.oldest_ancester_time;
uint64_t num_input_entries = 0;
uint64_t memtable_payload_bytes = 0;
uint64_t memtable_garbage_bytes = 0;
IOStatus io_s;
const std::string* const full_history_ts_low =
(full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
// tboptions captures the column family ID (cfd_->GetID()) and is then passed
// to BuildTable, so the ID travels inside the TableBuilderOptions
TableBuilderOptions tboptions(
*cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
cfd_->int_tbl_prop_collector_factories(), output_compression_,
mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
0 /* level */, false /* is_bottommost */,
TableFileCreationReason::kFlush, creation_time, oldest_key_time,
current_time, db_id_, db_session_id_, 0 /* target_file_size */,
meta_.fd.GetNumber());
// Call BuildTable to construct the SSTable
s = BuildTable(
dbname_, versions_, db_options_, tboptions, file_options_,
cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
&blob_file_additions, existing_snapshots_,
earliest_write_conflict_snapshot_, snapshot_checker_,
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
&io_s, io_tracer_, BlobFileCreationReason::kFlush, event_logger_,
job_context_->job_id, Env::IO_HIGH, &table_properties_, write_hint,
full_history_ts_low, blob_callback_, &num_input_entries,
&memtable_payload_bytes, &memtable_garbage_bytes);
if (!io_s.ok()) {
io_status_ = io_s;
}
// A mismatch between the entries counted in the memtables and the entries
// BuildTable read is always logged; it becomes a Corruption error only when
// flush_verify_memtable_count is set.
if (num_input_entries != total_num_entries && s.ok()) {
std::string msg = "Expected " + ToString(total_num_entries) +
" entries in memtables, but read " +
ToString(num_input_entries);
ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
cfd_->GetName().c_str(), job_context_->job_id,
msg.c_str());
if (db_options_.flush_verify_memtable_count) {
s = Status::Corruption(msg);
}
}
if (tboptions.reason == TableFileCreationReason::kFlush) {
TEST_SYNC_POINT("DBImpl::FlushJob:Flush");
RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
memtable_payload_bytes);
RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
memtable_garbage_bytes);
}
LogFlush(db_options_.info_log);
}
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
" bytes %s"
"%s",
cfd_->GetName().c_str(), job_context_->job_id,
meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
s.ToString().c_str(),
meta_.marked_for_compaction ? " (needs compaction)" : "");
// Fsync the output directory after a successful build, when requested.
if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
s = output_file_directory_->FsyncWithDirOptions(
IOOptions(), nullptr,
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
}
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
db_mutex_->Lock();
}
base_->Unref();
// Note that if file_size is zero, the file has been deleted and
// should not be added to the manifest.
const bool has_output = meta_.fd.GetFileSize() > 0;
if (s.ok() && has_output) {
TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
// if we have more than 1 background thread, then we cannot
// insert files directly into higher levels because some other
// threads could be concurrently producing compacted files for
// that key range.
// Add file to L0
// Add the generated file to L0
edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
meta_.marked_for_compaction, meta_.oldest_blob_file_number,
meta_.oldest_ancester_time, meta_.file_creation_time,
meta_.file_checksum, meta_.file_checksum_func_name,
meta_.min_timestamp, meta_.max_timestamp);
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
}
#ifndef ROCKSDB_LITE
// Piggyback FlushJobInfo on the first first flushed memtable.
mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
#endif // !ROCKSDB_LITE
// Note that here we treat flush as level 0 compaction in internal stats
InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
const uint64_t micros = clock_->NowMicros() - start_micros;
const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
stats.micros = micros;
stats.cpu_micros = cpu_micros;
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Flush lasted %" PRIu64
" microseconds, and %" PRIu64 " cpu microseconds.\n",
cfd_->GetName().c_str(), job_context_->job_id, micros,
cpu_micros);
if (has_output) {
stats.bytes_written = meta_.fd.GetFileSize();
stats.num_output_files = 1;
}
// Blob files recorded in the edit are accounted for as well.
const auto& blobs = edit_->GetBlobFileAdditions();
for (const auto& blob : blobs) {
stats.bytes_written_blob += blob.GetTotalBlobBytes();
}
stats.num_output_files_blob = static_cast<int>(blobs.size());
RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
cfd_->internal_stats()->AddCFStats(
InternalStats::BYTES_FLUSHED,
stats.bytes_written + stats.bytes_written_blob);
RecordFlushIOStats();
return s;
}
调用 BuildTable 函数构造 SSTable
// Builds one table file from the entries of `iter` plus the range tombstones
// in `range_del_iters`. The file name is derived from meta->fd (number and
// path id).
//
// On return meta->fd.file_size is 0 when no file was kept (empty input or
// failure); in that case the table file and any blob files created here are
// deleted. Otherwise meta carries the file size, key boundaries, the
// marked_for_compaction flag and the file checksum.
//
// Optional outputs (written only when non-null): num_input_entries,
// table_properties, and memtable_payload_bytes / memtable_garbage_bytes (the
// latter two only when both are non-null). io_status is always dereferenced
// and must not be null.
//
// Returns the first error encountered; an error reported by the input
// iterator takes precedence.
Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
std::vector<SequenceNumber> snapshots,
SequenceNumber earliest_write_conflict_snapshot,
SnapshotChecker* snapshot_checker, bool paranoid_file_checks,
InternalStats* internal_stats, IOStatus* io_status,
const std::shared_ptr<IOTracer>& io_tracer,
BlobFileCreationReason blob_creation_reason, EventLogger* event_logger,
int job_id, const Env::IOPriority io_priority,
TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
const std::string* full_history_ts_low,
BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
// The column family id is "unknown" exactly when the name is empty.
assert((tboptions.column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
tboptions.column_family_name.empty());
auto& mutable_cf_options = tboptions.moptions;
auto& ioptions = tboptions.ioptions;
// Reports the IOStats for flush for every following bytes.
const size_t kReportFlushIOStatsEvery = 1048576;
// Validates the entries as they are added; compared against a second
// validator over the finished file when paranoid_file_checks is set.
OutputValidator output_validator(
tboptions.internal_comparator,
/*enable_order_check=*/
mutable_cf_options.check_flush_compaction_key_order,
/*enable_hash=*/paranoid_file_checks);
Status s;
meta->fd.file_size = 0;
iter->SeekToFirst();
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
new CompactionRangeDelAggregator(&tboptions.internal_comparator,
snapshots));
uint64_t num_unfragmented_tombstones = 0;
uint64_t total_tombstone_payload_bytes = 0;
// Read the tombstone counters first, then move each iterator into the
// aggregator.
for (auto& range_del_iter : range_del_iters) {
num_unfragmented_tombstones +=
range_del_iter->num_unfragmented_tombstones();
total_tombstone_payload_bytes +=
range_del_iter->total_tombstone_payload_bytes();
range_del_agg->AddTombstones(std::move(range_del_iter));
}
std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
meta->fd.GetPathId());
std::vector<std::string> blob_file_paths;
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
#ifndef ROCKSDB_LITE
EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
tboptions.column_family_name,
fname, job_id, tboptions.reason);
#endif // !ROCKSDB_LITE
Env* env = db_options.env; // get the Env
assert(env);
FileSystem* fs = db_options.fs.get(); // get the file system
assert(fs);
TableProperties tp;
// Only create a file when there is something to write: point entries or
// range tombstones.
if (iter->Valid() || !range_del_agg->IsEmpty()) {
std::unique_ptr<CompactionFilter> compaction_filter;
if (ioptions.compaction_filter_factory != nullptr &&
ioptions.compaction_filter_factory->ShouldFilterTableFileCreation(
tboptions.reason)) {
CompactionFilter::Context context;
context.is_full_compaction = false;
context.is_manual_compaction = false;
context.column_family_id = tboptions.column_family_id;
context.reason = tboptions.reason;
compaction_filter =
ioptions.compaction_filter_factory->CreateCompactionFilter(context);
if (compaction_filter != nullptr &&
!compaction_filter->IgnoreSnapshots()) {
s.PermitUncheckedError();
return Status::NotSupported(
"CompactionFilter::IgnoreSnapshots() = false is not supported "
"anymore.");
}
}
// Raw owning pointer; released by the `delete builder` further down.
TableBuilder* builder;
std::unique_ptr<WritableFileWriter> file_writer;
{
std::unique_ptr<FSWritableFile> file; // the FSWritableFile to be created
#ifndef NDEBUG
bool use_direct_writes = file_options.use_direct_writes;
TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
#endif // !NDEBUG
IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); // create the file
assert(s.ok());
s = io_s;
if (io_status->ok()) {
*io_status = io_s;
}
// File creation failed: report the failure to listeners and stop.
if (!s.ok()) {
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger, ioptions.listeners, dbname,
tboptions.column_family_name, fname, job_id, meta->fd,
kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum,
file_checksum_func_name);
return s;
}
FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
file->SetIOPriority(io_priority);
file->SetWriteLifeTimeHint(write_hint); // set the file's write lifetime hint
file_writer.reset(new WritableFileWriter(
std::move(file), fname, file_options, ioptions.clock, io_tracer,
ioptions.stats, ioptions.listeners,
ioptions.file_checksum_gen_factory.get(),
tmp_set.Contains(FileType::kTableFile), false));
builder = NewTableBuilder(tboptions, file_writer.get());
}
MergeHelper merge(
env, tboptions.internal_comparator.user_comparator(),
ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
true /* internal key corruption is not ok */,
snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
// A blob file builder is created only when blob files are enabled and the
// caller supplied a place to record the additions.
std::unique_ptr<BlobFileBuilder> blob_file_builder(
(mutable_cf_options.enable_blob_files && blob_file_additions)
? new BlobFileBuilder(
versions, fs, &ioptions, &mutable_cf_options, &file_options,
job_id, tboptions.column_family_id,
tboptions.column_family_name, io_priority, write_hint,
io_tracer, blob_callback, blob_creation_reason,
&blob_file_paths, blob_file_additions)
: nullptr);
CompactionIterator c_iter(
iter, tboptions.internal_comparator.user_comparator(), &merge,
kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats),
true /* internal key corruption is not ok */, range_del_agg.get(),
blob_file_builder.get(), ioptions.allow_data_in_errors,
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr,
/*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
/*manual_compaction_canceled=*/nullptr, db_options.info_log,
full_history_ts_low);
c_iter.SeekToFirst();
// Feed every entry produced by the compaction iterator into the builder.
for (; c_iter.Valid(); c_iter.Next()) {
const Slice& key = c_iter.key();
const Slice& value = c_iter.value();
const ParsedInternalKey& ikey = c_iter.ikey();
// Generate a rolling 64-bit hash of the key and values
// Note :
// Here "key" integrates 'sequence_number'+'kType'+'user key'.
s = output_validator.Add(key, value);
if (!s.ok()) {
break;
}
builder->Add(key, value);
meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
// TODO(noetzli): Update stats after flush, too.
if (io_priority == Env::IO_HIGH &&
IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
ThreadStatusUtil::SetThreadOperationProperty(
ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
}
}
// Keep an earlier validation error; otherwise adopt the iterator's error.
if (!s.ok()) {
c_iter.status().PermitUncheckedError();
} else if (!c_iter.status().ok()) {
s = c_iter.status();
}
// Append the range tombstones after the point entries.
if (s.ok()) {
auto range_del_it = range_del_agg->NewIterator();
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
auto kv = tombstone.Serialize();
builder->Add(kv.first.Encode(), kv.second);
meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
tombstone.seq_,
tboptions.internal_comparator);
}
}
TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
const bool empty = builder->IsEmpty();
if (num_input_entries != nullptr) {
*num_input_entries =
c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
}
// Nothing was added, or an error occurred: abandon instead of finishing.
if (!s.ok() || empty) {
builder->Abandon();
} else {
s = builder->Finish();
}
if (io_status->ok()) {
*io_status = builder->io_status();
}
if (s.ok() && !empty) {
uint64_t file_size = builder->FileSize();
meta->fd.file_size = file_size;
meta->marked_for_compaction = builder->NeedCompact();
assert(meta->fd.GetFileSize() > 0);
tp = builder->GetTableProperties(); // refresh now that builder is finished
if (memtable_payload_bytes != nullptr &&
memtable_garbage_bytes != nullptr) {
const CompactionIterationStats& ci_stats = c_iter.iter_stats();
uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes +
ci_stats.total_input_raw_value_bytes +
total_tombstone_payload_bytes;
uint64_t total_payload_bytes_written =
(tp.raw_key_size + tp.raw_value_size);
// Prevent underflow, which may still happen at this point
// since we only support inserts, deletes, and deleteRanges.
if (total_payload_bytes_written <= total_payload_bytes) {
*memtable_payload_bytes = total_payload_bytes;
*memtable_garbage_bytes =
total_payload_bytes - total_payload_bytes_written;
} else {
*memtable_payload_bytes = 0;
*memtable_garbage_bytes = 0;
}
}
if (table_properties) {
*table_properties = tp;
}
}
delete builder;
// Finish and check for file errors
TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
if (s.ok() && !empty) {
StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS);
*io_status = file_writer->Sync(ioptions.use_fsync);
}
TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
if (s.ok() && io_status->ok() && !empty) {
*io_status = file_writer->Close();
}
if (s.ok() && io_status->ok() && !empty) {
// Add the checksum information to file metadata.
meta->file_checksum = file_writer->GetFileChecksum();
meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
file_checksum = meta->file_checksum;
file_checksum_func_name = meta->file_checksum_func_name;
}
// Fold the I/O status into the overall status.
if (s.ok()) {
s = *io_status;
}
if (blob_file_builder) {
if (s.ok()) {
s = blob_file_builder->Finish();
} else {
blob_file_builder->Abandon(s);
}
blob_file_builder.reset();
}
// TODO Also check the IO status when create the Iterator.
if (s.ok() && !empty) {
// Verify that the table is usable
// We set for_compaction to false and don't OptimizeForCompactionTableRead
// here because this is a special case after we finish the table building
// No matter whether use_direct_io_for_flush_and_compaction is true,
// we will regrad this verification as user reads since the goal is
// to cache it here for further user reads
ReadOptions read_options;
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
read_options, file_options, tboptions.internal_comparator, *meta,
nullptr /* range_del_agg */,
mutable_cf_options.prefix_extractor.get(), nullptr,
(internal_stats == nullptr) ? nullptr
: internal_stats->GetFileReadHist(0),
TableReaderCaller::kFlush, /*arena=*/nullptr,
/*skip_filter=*/false, tboptions.level_at_creation,
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false));
s = it->status();
// Paranoid mode: re-read the whole file and compare it with what was
// recorded by output_validator while writing.
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,
/*enable_order_check=*/true,
/*enable_hash=*/true);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
// Generate a rolling 64-bit hash of the key and values
file_validator.Add(it->key(), it->value()).PermitUncheckedError();
}
s = it->status();
if (s.ok() && !output_validator.CompareValidator(file_validator)) {
s = Status::Corruption("Paranoid checksums do not match");
}
}
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
// On failure, or when no file was produced, remove the table file and any
// blob files created here; deletion errors are deliberately ignored.
if (!s.ok() || meta->fd.GetFileSize() == 0) {
TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");
constexpr IODebugContext* dbg = nullptr;
Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
ignored.PermitUncheckedError();
assert(blob_file_additions || blob_file_paths.empty());
if (blob_file_additions) {
for (const std::string& blob_file_path : blob_file_paths) {
ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
/*force_bg=*/false, /*force_fg=*/false);
ignored.PermitUncheckedError();
TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
}
}
}
// Listeners are told about an empty (not kept) file via an Aborted status
// and a placeholder file name, while the caller still receives `s`.
Status status_for_listener = s;
if (meta->fd.GetFileSize() == 0) {
fname = "(nil)";
if (s.ok()) {
status_for_listener = Status::Aborted("Empty SST file not kept");
}
}
// Output to event logger and fire events.
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger, ioptions.listeners, dbname, tboptions.column_family_name,
fname, job_id, meta->fd, meta->oldest_blob_file_number, tp,
tboptions.reason, status_for_listener, file_checksum,
file_checksum_func_name);
return s;
}
BuildTable 函数借助 Env、FileSystem、WritableFileWriter 和 FSWritableFile 等类,可以实现文件的持久化,开始对接操作系统。
在上一节中,BuildTable 函数里有这样一句代码:
IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
进入这个函数,就找到了 RocksDB 和 ZenFS 文件系统插件的连接点,
// Thin wrapper that asks the given FileSystem to create a writable file named
// `fname` and stores it in `*result`. The call is dispatched through `fs`, so
// with the ZenFS plugin installed it ends up in ZenFS::NewWritableFile.
// Returns the IOStatus reported by the file system.
IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
                         std::unique_ptr<FSWritableFile>* result,
                         const FileOptions& options) {
  TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature",
                           const_cast<Temperature*>(&options.temperature));

  // No debug context is supplied.
  IOStatus create_status =
      fs->NewWritableFile(fname, options, result, nullptr);

  TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2);
  return create_status;
}
// Creates a new writable file named `fname`.
//
// An existing file with the same name is deleted first. The new ZoneFile's
// metadata is synced before the file is registered in files_; if that sync
// fails, nothing is registered and the error is returned. On success `*result`
// holds a ZonedWritableFile over the new ZoneFile (the second constructor
// argument is true when direct writes are NOT requested).
IOStatus ZenFS::NewWritableFile(const std::string& fname,
                                const FileOptions& file_opts,
                                std::unique_ptr<FSWritableFile>* result,
                                IODebugContext* /*dbg*/) {
  IOStatus status;

  Debug(logger_, "New writable file: %s direct: %d\n", fname.c_str(),
        file_opts.use_direct_writes);

  // Replace any file that already exists under this name.
  if (GetFile(fname) != nullptr) {
    status = DeleteFile(fname);
    if (!status.ok()) {
      return status;
    }
  }

  std::shared_ptr<ZoneFile> zone_file(
      new ZoneFile(zbd_, fname, next_file_id_++));
  zone_file->SetFileModificationTime(time(0));

  /* Persist the creation of the file */
  status = SyncFileMetadata(zone_file);
  if (!status.ok()) {
    zone_file.reset();
    return status;
  }

  // Register the file under files_mtx_.
  files_mtx_.lock();
  files_.insert(std::make_pair(fname.c_str(), zone_file));
  files_mtx_.unlock();

  const bool buffered = !file_opts.use_direct_writes;
  result->reset(
      new ZonedWritableFile(zbd_, buffered, zone_file, &metadata_writer_));
  return status;
}
下一篇文章将以具体的文件系统插件为例,继续分析 ZenFS 插件的持久化实现方式。