
TCMalloc Memory Allocation Source Code Analysis

姚和顺
2023-12-01

I. Introduction

The previous article covered the algorithms behind tcmalloc's memory management; this time we look at how the source code implements them. Reading the source naturally goes hand in hand with the algorithms, but in some of the details the code shows more clearly how memory is actually controlled. This article only analyzes the allocation flow; the source for tcmalloc's overall control and management of memory will be covered in the next article.

II. Basic Data Structures

Following the algorithm description in the previous article, let's analyze the related source code. The fundamental data structures in tcmalloc fall into two main groups:

1. Core abstract data structures
The thread-local cache, ThreadCache:

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
  void Init(pthread_t tid) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t size_class) const {
    return list_[size_class].length();
  }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size class. When allocation fails
  // (from this cache and after running FetchFromCentralCache),
  // OOMHandler(size) is called and its return value is
  // returned from Allocate. OOMHandler is used to parameterize
  // out-of-memory handling (raising exception, returning nullptr,
  // calling new_handler or anything else). "Passing" OOMHandler in
  // this way allows Allocate to be used in tail-call position in
  // fast-path, making allocate tail-call slow path code.
  template <void* OOMHandler(size_t)>
  void* Allocate(size_t size_class);

  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();

  Sampler* GetSampler();

  static void InitTSD();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();

  // returns stats on total thread caches created/used
  static inline AllocatorStats HeapStats()
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Adds to *total_bytes the total number of bytes used by all thread heaps.
  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
  // and this function will increment each element of class_count by the number
  // of items in all thread-local freelists of the corresponding size class.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  static void set_overall_thread_cache_size(size_t new_size)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  static size_t overall_thread_cache_size()
      ABSL_SHARED_LOCKS_REQUIRED(pageheap_lock) {
    return overall_thread_cache_size_;
  }

  template <void* OOMHandler(size_t)>
  void* ABSL_ATTRIBUTE_NOINLINE AllocateSlow(size_t size_class,
                                             size_t allocated_size) {
    void* ret = FetchFromCentralCache(size_class, allocated_size);
    if (ABSL_PREDICT_TRUE(ret != nullptr)) {
      return ret;
    }
    return OOMHandler(allocated_size);
  }

 private:
  // We inherit rather than include the list as a data structure to reduce
  // compiler padding.  Without inheritance, the compiler pads the list
  // structure and then adds it as a member, even though we could fit everything
  // without padding.
  class FreeList : public LinkedList {
   private:
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;

    // This extra unused field pads FreeList size to 32 bytes on 64
    // bit machines, helping compiler generate faster code for
    // indexing array of lists.
    void* ABSL_ATTRIBUTE_UNUSED extra_;

   public:
    void Init() {
      LinkedList::Init();
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return the maximum length of the list.
    size_t max_length() const { return max_length_; }

    // Set the maximum length of the list.  If 'new_max' > length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) { max_length_ = new_max; }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const { return length_overages_; }

    void set_length_overages(size_t new_count) { length_overages_ = new_count; }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length(); }

    ABSL_ATTRIBUTE_ALWAYS_INLINE bool TryPop(void** ret) {
      bool out = LinkedList::TryPop(ret);
      if (ABSL_PREDICT_TRUE(out) && ABSL_PREDICT_FALSE(length() < lowater_)) {
        lowater_ = length();
      }
      return out;
    }

    void PopBatch(int N, void** batch) {
      LinkedList::PopBatch(N, batch);
      if (length() < lowater_) lowater_ = length();
    }
  };

The CentralCache used for global memory management:

// Data kept per size-class in central cache.
template <typename ForwarderT>
class CentralFreeList {
 public:
  using Forwarder = ForwarderT;

  constexpr CentralFreeList()
      : lock_(absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY),
        size_class_(0),
        object_size_(0),
        objects_per_span_(0),
        first_nonempty_index_(0),
        pages_per_span_(0),
        nonempty_() {}

  CentralFreeList(const CentralFreeList&) = delete;
  CentralFreeList& operator=(const CentralFreeList&) = delete;

  void Init(size_t size_class) ABSL_LOCKS_EXCLUDED(lock_);

  // These methods all do internal locking.

  // Insert batch into the central freelist.
  // REQUIRES: batch.size() > 0 && batch.size() <= kMaxObjectsToMove.
  void InsertRange(absl::Span<void*> batch) ABSL_LOCKS_EXCLUDED(lock_);

  // Fill a prefix of batch[0..N-1] with up to N elements removed from central
  // freelist.  Return the number of elements removed.
  ABSL_MUST_USE_RESULT int RemoveRange(void** batch, int N)
      ABSL_LOCKS_EXCLUDED(lock_);

  // Returns the number of free objects in cache.
  size_t length() const { return static_cast<size_t>(counter_.value()); }

  // Returns the memory overhead (internal fragmentation) attributable
  // to the freelist.  This is memory lost when the size of elements
  // in a freelist doesn't exactly divide the page-size (an 8192-byte
  // page full of 5-byte objects would have 2 bytes memory overhead).
  size_t OverheadBytes() const;

  // Returns number of live spans currently in the nonempty_[n] list.
  // REQUIRES: n >= 0 && n < kNumLists.
  size_t NumSpansInList(int n) ABSL_LOCKS_EXCLUDED(lock_);
  SpanStats GetSpanStats() const;

  // Reports span utilization histogram stats.
  void PrintSpanUtilStats(Printer* out) const;
  void PrintSpanUtilStatsInPbtxt(PbtxtRegion* region) const;

  // Get number of spans in the histogram bucket. We record spans in the
  // histogram indexed by absl::bit_width(allocated). So, instead of using the
  // absolute number of allocated objects, it uses absl::bit_width(allocated),
  // passed as <bitwidth>, to index and return the number of spans in the
  // histogram.
  size_t NumSpansWith(uint16_t bitwidth) const;

  Forwarder& forwarder() { return forwarder_; }

 private:
  // Release an object to spans.
  // Returns the object's span if it becomes completely free.
  Span* ReleaseToSpans(void* object, Span* span, size_t object_size)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Populate cache by fetching from the page heap.
  // May temporarily release lock_.
  // Fill a prefix of batch[0..N-1] with up to N elements removed from central
  // freelist. Returns the number of elements removed.
  int Populate(void** batch, int N) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Parses nonempty_ lists and returns span from the list with the lowest
  // possible index when span prioritization is enabled.
  // Returns the span if one exists in the nonempty_ lists. Else, returns
  // nullptr.
  Span* FirstNonEmptySpan() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns first index to the nonempty_ lists that may record spans. When span
  // prioritization is enabled, this returns first_nonempty_index_. When
  // disabled, this returns kNumLists - 1 as this is the index to the nonempty_
  // list we use to record all the spans.
  uint8_t GetFirstNonEmptyIndex();

  // Returns index into nonempty_ based on the number of allocated objects for
  // the span. Instead of using the absolute number of allocated objects, it
  // uses absl::bit_width(allocated), passed as bitwidth, to calculate the list
  // index.
  uint8_t IndexFor(uint8_t bitwidth);

  // Records span utilization in objects_to_span_ map. Instead of using the
  // absolute number of allocated objects, it uses
  // absl::bit_width(allocated), passed as <bitwidth>, to index this map.
  //
  // If increase is set to true, includes the span by incrementing the count
  // in the map. Otherwise, removes the span by decrementing the count in
  // the map.
  void RecordSpanUtil(uint8_t bitwidth, bool increase)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
    ASSUME(bitwidth > 0);
    // Updates to objects_to_span_ are guarded by lock_, so writes may be
    // performed using LossyAdd.
    objects_to_spans_[bitwidth - 1].LossyAdd(increase ? 1 : -1);
  }

  // This lock protects all the mutable data members.
  absl::base_internal::SpinLock lock_;

  size_t size_class_;  // My size class (immutable after Init())
  size_t object_size_;
  size_t objects_per_span_;
  // Hint used for parsing through the nonempty_ lists. This prevents us from
  // parsing the lists with an index starting zero, if the lowest possible index
  // is higher than that.
  size_t first_nonempty_index_;
  Length pages_per_span_;

  size_t num_spans() const {
    size_t requested = num_spans_requested_.value();
    size_t returned = num_spans_returned_.value();
    if (requested < returned) return 0;
    return (requested - returned);
  }

  void RecordSpanAllocated() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
    counter_.LossyAdd(objects_per_span_);
    num_spans_requested_.LossyAdd(1);
  }

  void RecordMultiSpansDeallocated(size_t num_spans_returned)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
    counter_.LossyAdd(-num_spans_returned * objects_per_span_);
    num_spans_returned_.LossyAdd(num_spans_returned);
  }

  void UpdateObjectCounts(int num) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
    counter_.LossyAdd(num);
  }

  // The following are kept as StatsCounters so that they can be read without
  // acquiring a lock. Updates to these variables are guarded by lock_,
  // so writes are performed using LossyAdd for speed; the lock still
  // guarantees accuracy.

  // Num free objects in cache entry
  StatsCounter counter_;

  StatsCounter num_spans_requested_;
  StatsCounter num_spans_returned_;

  // Records histogram of span utilization.
  //
  // Each bucket in the histogram records number of live spans with
  // corresponding number of allocated objects. Instead of using the absolute
  // value of number of allocated objects, we use absl::bit_width(allocated) to
  // index this map. A bucket in the histogram corresponds to power-of-two
  // number of objects. That is, bucket N tracks number of spans with allocated
  // objects < 2^(N+1). For instance, objects_to_spans_ map tracks number of
  // spans with allocated objects in the range [a,b), indexed as: [1,2) in
  // objects_to_spans_[0], [2,4) in objects_to_spans_[1], [4, 8) in
  // objects_to_spans_[2] and so on. We can query the objects_to_spans_ map
  // using NumSpansWith(bitwidth) to obtain the number of spans associated
  // with the corresponding bucket in the histogram.
  //
  // As the actual value of objects_per_span_ is not known at compile time, we
  // use maximum value that it can be to initialize this hashmap, and
  // kSpanUtilBucketCapacity determines this value. We also check during Init
  // that absl::bit_width(objects_per_span_) is indeed less than or equal to
  // kSpanUtilBucketCapacity.
  //
  // We disable collection of histogram stats for TCMalloc small-but-slow due to
  // performance issues. See b/227362263.
  static constexpr size_t kSpanUtilBucketCapacity = 16;
  StatsCounter objects_to_spans_[kSpanUtilBucketCapacity];

  // Non-empty lists that distinguish spans based on the number of objects
  // allocated from them. When span prioritization is enabled, spans may be
  // added to any of the kNumLists nonempty_ lists based on their allocated
  // objects. If span prioritization is disabled, we add spans to the
  // nonempty_[kNumlists-1] list, leaving other lists unused.
  //
  // We do not enable multiple nonempty lists for small-but-slow yet due to
  // performance issues. See b/227362263.
#ifdef TCMALLOC_SMALL_BUT_SLOW
  SpanList nonempty_ ABSL_GUARDED_BY(lock_);
#else
  HintedTrackerLists<Span, kNumLists> nonempty_ ABSL_GUARDED_BY(lock_);
#endif

  TCMALLOC_NO_UNIQUE_ADDRESS Forwarder forwarder_;
};
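
The objects_to_spans_ histogram above indexes spans by absl::bit_width(allocated) rather than by the raw allocated count. As a quick illustration of that bucketing, here is a standalone sketch using C++20's std::bit_width as a stand-in for absl::bit_width; BucketFor and the sample values are mine, not tcmalloc identifiers.

#include <bit>
#include <cstdio>

// Bucket N covers spans with allocated objects in [2^N, 2^(N+1)),
// i.e. index = bit_width(allocated) - 1, matching the comment above.
int BucketFor(unsigned allocated) {
  return static_cast<int>(std::bit_width(allocated)) - 1;
}

int main() {
  for (unsigned allocated : {1u, 2u, 3u, 4u, 7u, 8u, 1024u}) {
    std::printf("allocated=%4u -> bucket %d\n", allocated, BucketFor(allocated));
  }
}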

The heap-level memory data structure, PageHeap:

class PageHeap final : public PageAllocatorInterface {
 public:
  explicit PageHeap(MemoryTag tag);
  // for testing
  PageHeap(PageMap* map, MemoryTag tag);
  ~PageHeap() override = default;

  // Allocate a run of "n" pages. These would be used for allocating 'num_objects'
  // objects. Returns zero if out of memory.  Caller should not pass "n == 0" --
  // instead, n should have been rounded up already.  The returned memory is
  // backed.
  Span* New(Length n, size_t objects_per_span)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // As New, but the returned span is aligned to a <align>-page boundary.
  // <align> must be a power of two.
  Span* NewAligned(Length n, Length align, size_t objects_per_span)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // Delete the span "[p, p+n-1]".
  // REQUIRES: span was returned by earlier call to New() and
  //           has not yet been deleted.
  void Delete(Span* span, size_t objects_per_span)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  inline BackingStats stats() const
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override {
    return stats_;
  }

  void GetSmallSpanStats(SmallSpanStats* result)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  void GetLargeSpanStats(LargeSpanStats* result)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  // Try to release at least num_pages for reuse by the OS.  Returns
  // the actual number of pages released, which may be less than
  // num_pages if there weren't enough pages to release. The result
  // may also be larger than num_pages since page_heap might decide to
  // release one large range instead of fragmenting it into two
  // smaller released and unreleased ranges.
  Length ReleaseAtLeastNPages(Length num_pages)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  // Prints stats about the page heap to *out.
  void Print(Printer* out) ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  void PrintInPbtxt(PbtxtRegion* region)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

 private:
  // We segregate spans of a given size into two circular linked
  // lists: one for normal spans, and one for spans whose memory
  // has been returned to the system.
  struct SpanListPair {
    SpanList normal;
    SpanList returned;
  };

  // List of free spans of length >= kMaxPages
  SpanListPair large_ ABSL_GUARDED_BY(pageheap_lock);

  // Array mapping from span length to a doubly linked list of free spans
  SpanListPair free_[kMaxPages.raw_num()] ABSL_GUARDED_BY(pageheap_lock);

  // Statistics on system, free, and unmapped bytes
  BackingStats stats_ ABSL_GUARDED_BY(pageheap_lock);

  Span* SearchFreeAndLargeLists(Length n, bool* from_returned)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  bool GrowHeap(Length n) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // REQUIRES: span->length >= n
  // REQUIRES: span->location != IN_USE
  // Remove span from its free list, and move any leftover part of
  // span into appropriate free lists.  Also update "span" to have
  // length exactly "n" and mark it as non-free so it can be returned
  // to the client.  After all that, decrease free_pages_ by n and
  // return span.
  Span* Carve(Span* span, Length n)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Allocate a large span of length == n.  If successful, returns a
  // span of exactly the specified length.  Else, returns NULL.
  Span* AllocLarge(Length n, bool* from_returned)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Coalesce span with neighboring spans if possible, prepend to
  // appropriate free list, and adjust stats.
  void MergeIntoFreeList(Span* span)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Prepends span to appropriate free list, and adjusts stats.  You'll probably
  // want to adjust span->freelist_added_time before/after calling this
  // function.
  void PrependToFreeList(Span* span)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Removes span from its free list, and adjust stats.
  void RemoveFromFreeList(Span* span)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Release the last span on the normal portion of this list.
  // Return the length of that span.
  Length ReleaseLastNormalSpan(SpanListPair* slist)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Do invariant testing.
  bool Check() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Index of last free list where we released memory to the OS.
  int release_index_ ABSL_GUARDED_BY(pageheap_lock);

  Span* AllocateSpan(Length n, bool* from_returned)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  void RecordSpan(Span* span) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
};

The PageMap, used to locate a Span quickly from a page:

// Two-level radix tree
typedef void* (*PagemapAllocator)(size_t);
void* MetaDataAlloc(size_t bytes);

template <int BITS, PagemapAllocator Allocator>
class PageMap2 {
 private:
  // The leaf node (regardless of pointer size) always maps 2^15 entries;
  // with 8K pages, this gives us 256MB mapped per leaf node.
  static constexpr int kLeafBits = 15;
  static constexpr int kLeafLength = 1 << kLeafBits;
  static constexpr int kRootBits = (BITS >= kLeafBits) ? (BITS - kLeafBits) : 0;
  // (1<<kRootBits) must not overflow an "int"
  static_assert(kRootBits < sizeof(int) * 8 - 1, "kRootBits is too large");
  static constexpr int kRootLength = 1 << kRootBits;

  static constexpr size_t kLeafCoveredBytes = 1ul << (kLeafBits + kPageShift);
  static_assert(kLeafCoveredBytes >= kHugePageSize, "leaf too small");
  static constexpr size_t kLeafHugeBits =
      (kLeafBits + kPageShift - kHugePageShift);
  static constexpr size_t kLeafHugepages = kLeafCoveredBytes / kHugePageSize;
  static_assert(kLeafHugepages == 1 << kLeafHugeBits, "sanity");
  struct Leaf {
    // We keep parallel arrays indexed by page number.  One keeps the
    // size class; another span pointers; the last hugepage-related
    // information.  The size class information is kept segregated
    // since small object deallocations are so frequent and do not
    // need the other information kept in a Span.
    CompactSizeClass sizeclass[kLeafLength];
    Span* span[kLeafLength];
    void* hugepage[kLeafHugepages];
  };

The basic memory unit, Span:

class Span;
typedef TList<Span> SpanList;

class Span : public SpanList::Elem {
 public:
  // Allocator/deallocator for spans. Note that these functions are defined
  // in static_vars.h, which is weird: see there for why.
  static Span* New(PageId p, Length len)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
  static void Delete(Span* span) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // locations used to track what list a span resides on.
  enum Location {
    IN_USE,                // not on PageHeap lists
    ON_NORMAL_FREELIST,    // on normal PageHeap list
    ON_RETURNED_FREELIST,  // on returned PageHeap list
  };
  Location location() const;
  void set_location(Location loc);

  // ---------------------------------------------------------------------------
  // Support for sampled allocations.
  // There is one-to-one correspondence between a sampled allocation and a span.
  // ---------------------------------------------------------------------------

  // Mark this span in the "SAMPLED" state. It will store the corresponding
  // sampled allocation and update some global counters on the total size of
  // sampled allocations.
  void Sample(SampledAllocation* sampled_allocation);

  // Unmark this span from its "SAMPLED" state. It will return the sampled
  // allocation previously passed to Span::Sample() or nullptr if this is a
  // non-sampling span. It will also update the global counters on the total
  // size of sampled allocations.
  // REQUIRES: this is a SAMPLED span.
  SampledAllocation* Unsample();

  // Returns the sampled allocation of the span.
  // pageheap_lock is not required, but caller either needs to hold the lock or
  // ensure by some other means that the sampling state can't be changed
  // concurrently.
  // REQUIRES: this is a SAMPLED span.
  SampledAllocation* sampled_allocation() const;

  // Is it a sampling span?
  // For debug checks. pageheap_lock is not required, but caller needs to ensure
  // that sampling state can't be changed concurrently.
  bool sampled() const;

  // ---------------------------------------------------------------------------
  // Span memory range.
  // ---------------------------------------------------------------------------

  // Returns first page of the span.
  PageId first_page() const;

  // Returns the last page in the span.
  PageId last_page() const;

  // Sets span first page.
  void set_first_page(PageId p);

  // Returns start address of the span.
  ABSL_ATTRIBUTE_RETURNS_NONNULL void* start_address() const;

  // Returns number of pages in the span.
  Length num_pages() const;

  // Sets number of pages in the span.
  void set_num_pages(Length len);

  // Total memory bytes in the span.
  size_t bytes_in_span() const;

  // ---------------------------------------------------------------------------
  // Age tracking (for free spans in PageHeap).
  // ---------------------------------------------------------------------------

  uint64_t freelist_added_time() const;
  void set_freelist_added_time(uint64_t t);

  // Sets this span freelist added time to average of this and other times
  // weighted by their sizes.
  // REQUIRES: this is a ON_NORMAL_FREELIST or ON_RETURNED_FREELIST span.
  void AverageFreelistAddedTime(const Span* other);

  // Returns internal fragmentation of the span.
  // REQUIRES: this is a SMALL_OBJECT span.
  double Fragmentation(size_t object_size) const;

  // Returns number of objects allocated in the span.
  uint16_t Allocated() const {
    return allocated_.load(std::memory_order_relaxed);
  }

2. OS-level data structures
RegionManager, which manages mmap-backed memory:

class RegionManager {
 public:
  std::pair<void*, size_t> Alloc(size_t size, size_t alignment, MemoryTag tag);

  void DiscardMappedRegions() {
    std::fill(normal_region_.begin(), normal_region_.end(), nullptr);
    sampled_region_ = nullptr;
    cold_region_ = nullptr;
  }

 private:
  // Checks that there is sufficient space available in the reserved region
  // for the next allocation, if not allocate a new region.
  // Then returns a pointer to the new memory.
  std::pair<void*, size_t> Allocate(size_t size, size_t alignment,
                                    MemoryTag tag);

  std::array<AddressRegion*, kNumaPartitions> normal_region_{{nullptr}};
  AddressRegion* sampled_region_{nullptr};
  AddressRegion* cold_region_{nullptr};
};

A memory page is 8 KB by default, and is described by its own data structure:

// A single aligned page.
class PageId {
 public:
  constexpr PageId() : pn_(0) {}
  constexpr PageId(const PageId& p) = default;
  constexpr PageId& operator=(const PageId& p) = default;

  constexpr explicit PageId(uintptr_t pn) : pn_(pn) {}

  void* start_addr() const {
    return reinterpret_cast<void*>(pn_ << kPageShift);
  }

  uintptr_t start_uintptr() const { return pn_ << kPageShift; }

  size_t index() const { return pn_; }

  constexpr PageId& operator+=(Length rhs) {
    pn_ += rhs.raw_num();
    return *this;
  }

  constexpr PageId& operator-=(Length rhs) {
    ASSERT(pn_ >= rhs.raw_num());
    pn_ -= rhs.raw_num();
    return *this;
  }

 private:
  friend constexpr bool operator<(PageId lhs, PageId rhs);
  friend constexpr bool operator>(PageId lhs, PageId rhs);
  friend constexpr bool operator<=(PageId lhs, PageId rhs);
  friend constexpr bool operator>=(PageId lhs, PageId rhs);
  friend constexpr bool operator==(PageId lhs, PageId rhs);
  friend constexpr bool operator!=(PageId lhs, PageId rhs);
  friend constexpr Length operator-(PageId lhs, PageId rhs);

  uintptr_t pn_;
};
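
To make the PageId arithmetic concrete: with the default 8 KB page, kPageShift is 13, so a page number is simply the address shifted right by 13 and start_addr() shifts it back. Below is a minimal standalone sketch of that mapping; kPageShiftDemo and PageNumberOf are illustrative names of mine, not tcmalloc identifiers.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr int kPageShiftDemo = 13;                              // 8 KiB pages: 1 << 13
constexpr std::size_t kPageSizeDemo = std::size_t{1} << kPageShiftDemo;

// Page number of an arbitrary address: drop the low 13 bits.
std::uintptr_t PageNumberOf(const void* ptr) {
  return reinterpret_cast<std::uintptr_t>(ptr) >> kPageShiftDemo;
}

int main() {
  int x = 0;
  std::uintptr_t pn = PageNumberOf(&x);
  // pn << kPageShiftDemo is the page's start address, which is exactly
  // what PageId::start_addr() computes from pn_.
  std::printf("object at %p lies on page %llu (page size %zu), page start %p\n",
              static_cast<void*>(&x),
              static_cast<unsigned long long>(pn),
              kPageSizeDemo,
              reinterpret_cast<void*>(pn << kPageShiftDemo));
}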

The fundamental data structures above do assume some C++ background, such as templates and operator overloading. If you have absolutely none of that, reading further will mostly be a waste of time; build that foundation first.

III. Allocation Flow Source Code

The allocation flow is broken down here step by step, from top to bottom, following the call chain in the code.
1. The top-level allocation entry points
The top level is essentially the familiar new and delete and their various overloads. It also includes placement new, which constructs an object in memory the caller already provides; unlike malloc or calloc, placement new performs no allocation of its own. Look at the code below:


// operator new/delete overrides, from libc_override_redefine.h
void* operator new(size_t size) { return TCMallocInternalNew(size); }
void operator delete(void* p) noexcept { TCMallocInternalDelete(p); }
void* operator new[](size_t size) { return TCMallocInternalNewArray(size); }
void operator delete[](void* p) noexcept { TCMallocInternalDeleteArray(p); }
void* operator new(size_t size, const std::nothrow_t& nt) noexcept {
  return TCMallocInternalNewNothrow(size, nt);
}
void* operator new[](size_t size, const std::nothrow_t& nt) noexcept {
  return TCMallocInternalNewArrayNothrow(size, nt);
}
void operator delete(void* ptr, const std::nothrow_t& nt) noexcept {
  return TCMallocInternalDeleteNothrow(ptr, nt);
}
void operator delete[](void* ptr, const std::nothrow_t& nt) noexcept {
  return TCMallocInternalDeleteArrayNothrow(ptr, nt);
}

// Global (default) operator new overloads taking tcmalloc's hot_cold hint; they forward to ::operator new
ABSL_ATTRIBUTE_WEAK void* operator new(size_t size, const std::nothrow_t,
                                       tcmalloc::hot_cold_t hot_cold) noexcept {
  return ::operator new(size, std::nothrow);
}

#ifdef __cpp_aligned_new
ABSL_ATTRIBUTE_WEAK void* operator new(
    size_t size, std::align_val_t alignment,
    tcmalloc::hot_cold_t hot_cold) noexcept(false) {
  return ::operator new(size, alignment);
}
#endif  // __cpp_aligned_new

// Placement new into pre-reserved static storage (from system-alloc.cc):
void InitSystemAllocatorIfNecessary() {
  if (region_factory) return;
  pagesize = getpagesize();
  // Sets the preferred alignment to be the largest of either the alignment
  // returned by mmap() or our minimum allocation size. The minimum allocation
  // size is usually a multiple of page size, but this need not be true for
  // SMALL_BUT_SLOW where we do not allocate in units of huge pages.
  preferred_alignment = std::max(pagesize, kMinSystemAlloc);
  region_manager = new (&region_manager_space) RegionManager();
  region_factory = new (&mmap_space) MmapRegionFactory();
}

In other words, tcmalloc provides support for all of these entry points. Global new and delete are both overridden, so even calls to the global allocation functions never fall through to the C++ default allocator.
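
Replacing the global allocation functions this way is standard C++; tcmalloc simply supplies its own definitions so that every new and delete in the program lands in its allocator. Below is a toy, single-threaded sketch of the replacement mechanism itself (it merely counts allocations on top of malloc and is not tcmalloc code):

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>

static std::size_t g_live_allocs = 0;   // not thread-safe; single-threaded demo only

void* operator new(std::size_t size) {
  void* p = std::malloc(size);
  if (p == nullptr) throw std::bad_alloc();
  ++g_live_allocs;
  return p;
}

void operator delete(void* p) noexcept {
  if (p != nullptr) --g_live_allocs;
  std::free(p);
}

int main() {
  int* x = new int(42);
  std::printf("live allocations after new: %zu\n", g_live_allocs);
  delete x;
  std::printf("live allocations after delete: %zu\n", g_live_allocs);
}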

2. Allocating from the ThreadCache
Starting from the overridden global allocation and deallocation operators above, every request enters tcmalloc's own memory management system:

size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
int64_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
ThreadCache* ThreadCache::thread_heaps_ = nullptr;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = nullptr;
#ifdef ABSL_HAVE_TLS
__thread ThreadCache* ThreadCache::thread_local_data_
    ABSL_ATTRIBUTE_INITIAL_EXEC = nullptr;
#endif
ABSL_CONST_INIT bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

thread_cache.cc begins with these static definitions: the list of per-thread heaps, the various size limits, and, most importantly, the thread-specific key defined at the end. That key (plus the __thread TLS copy when available) is what fundamentally distinguishes one thread, and therefore one ThreadCache, from another.
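
If the thread-specific key pattern is unfamiliar, here is a minimal sketch of the TSD mechanism ThreadCache builds on: one process-wide pthread key, one value slot per thread, and a destructor that runs at thread exit. FakeCache, GetOrCreateCache and the rest are illustrative names of mine, not tcmalloc code.

#include <pthread.h>
#include <cstdio>

static pthread_key_t cache_key;

struct FakeCache { int id; };

// Fast path: this thread's slot already holds a cache.
// Slow path: first use on this thread, so create one and store it in the slot.
FakeCache* GetOrCreateCache(int id) {
  void* existing = pthread_getspecific(cache_key);
  if (existing != nullptr) return static_cast<FakeCache*>(existing);
  FakeCache* c = new FakeCache{id};
  pthread_setspecific(cache_key, c);   // the key's destructor reclaims it at thread exit
  return c;
}

void* ThreadBody(void* arg) {
  FakeCache* c = GetOrCreateCache(*static_cast<int*>(arg));
  std::printf("thread %d got cache %p\n", c->id, static_cast<void*>(c));
  return nullptr;
}

int main() {
  pthread_key_create(&cache_key, [](void* p) { delete static_cast<FakeCache*>(p); });
  int ids[2] = {1, 2};
  pthread_t threads[2];
  for (int i = 0; i < 2; ++i) pthread_create(&threads[i], nullptr, ThreadBody, &ids[i]);
  for (int i = 0; i < 2; ++i) pthread_join(threads[i], nullptr);
}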
Next, the allocation functions from the header:

template <void* OOMHandler(size_t)>
inline void* ABSL_ATTRIBUTE_ALWAYS_INLINE
ThreadCache::Allocate(size_t size_class) {
  const size_t allocated_size = tc_globals.sizemap().class_to_size(size_class);

  FreeList* list = &list_[size_class];
  void* ret;
  if (ABSL_PREDICT_TRUE(list->TryPop(&ret))) {
    size_ -= allocated_size;
    return ret;
  }

  return AllocateSlow<OOMHandler>(size_class, allocated_size);
}

inline void ABSL_ATTRIBUTE_ALWAYS_INLINE
ThreadCache::Deallocate(void* ptr, size_t size_class) {
  FreeList* list = &list_[size_class];
  size_ += tc_globals.sizemap().class_to_size(size_class);
  ssize_t size_headroom = max_size_ - size_ - 1;

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and in that case we need a single branch
  // because of the bitwise-or trick that follows.
  if ((list_headroom | size_headroom) < 0) {
    DeallocateSlow(ptr, list, size_class);
  }
}

inline ThreadCache* ABSL_ATTRIBUTE_ALWAYS_INLINE
ThreadCache::GetCacheIfPresent() {
#ifdef ABSL_HAVE_TLS
  // __thread is faster
  return thread_local_data_;
#else
  return tsd_inited_
             ? reinterpret_cast<ThreadCache*>(pthread_getspecific(heap_key_))
             : nullptr;
#endif
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* tc = GetCacheIfPresent();
  return (ABSL_PREDICT_TRUE(tc != nullptr)) ? tc : CreateCacheIfNecessary();
}
// Remove some objects of class "size_class" from central cache and add to
// thread heap. On success, return the first object for immediate use; otherwise
// return NULL.
void* ThreadCache::FetchFromCentralCache(size_t size_class, size_t byte_size) {
  FreeList* list = &list_[size_class];
  ASSERT(list->empty());
  const int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);

  const int num_to_move = std::min<int>(list->max_length(), batch_size);
  void* batch[kMaxObjectsToMove];
  int fetch_count =
      tc_globals.transfer_cache().RemoveRange(size_class, batch, num_to_move);
  if (fetch_count == 0) {
    return nullptr;
  }

  if (--fetch_count > 0) {
    size_ += byte_size * fetch_count;
    list->PushBatch(fetch_count, batch + 1);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = std::min<int>(list->max_length() + batch_size,
                                   kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return batch[0];
}

A ThreadCache is essentially an array of FreeLists, not fundamentally different from the per-size-class free lists in Go's allocator. As the code above shows, on the fast path allocation and deallocation are simply a pop from, and a push onto, the FreeList of the requested size class.
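
The FreeList itself is an intrusive singly linked list: a free object stores the pointer to the next free object in its own first bytes, so the list needs no extra memory at all. A stripped-down sketch of that idea follows (this is not tcmalloc's LinkedList, just the same technique):

#include <cstddef>

class MiniFreeList {
 public:
  // Pop the head object, if any; the "next" pointer lives inside the free object.
  bool TryPop(void** ret) {
    if (head_ == nullptr) return false;
    *ret = head_;
    head_ = *static_cast<void**>(head_);
    --length_;
    return true;
  }
  // Push a freed object back: write the old head into its first bytes.
  void Push(void* ptr) {
    *static_cast<void**>(ptr) = head_;
    head_ = ptr;
    ++length_;
  }
  std::size_t length() const { return length_; }

 private:
  void* head_ = nullptr;
  std::size_t length_ = 0;
};

int main() {
  alignas(void*) unsigned char obj1[32];   // stand-ins for freed objects
  alignas(void*) unsigned char obj2[32];
  MiniFreeList list;
  list.Push(obj1);
  list.Push(obj2);
  void* p = nullptr;
  while (list.TryPop(&p)) { /* p is ready for reuse */ }
}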
Now look at initialization and cleanup:

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = nullptr;
  prev_ = nullptr;
  tid_ = tid;
  in_setspecific_ = false;
  for (size_t size_class = 0; size_class < kNumClasses; ++size_class) {
    list_[size_class].Init();
  }
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int size_class = 0; size_class < kNumClasses; ++size_class) {
    if (list_[size_class].length() > 0) {
      ReleaseToCentralCache(&list_[size_class], size_class,
                            list_[size_class].length());
    }
  }
}
void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;
}

ThreadCache::InitTSD is where the thread-private key gets created. Next, creation of the per-thread heap:

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  tc_globals.InitIfNecessary();
  ThreadCache* heap = nullptr;

#ifdef ABSL_HAVE_TLS
  const bool maybe_reentrant = !tsd_inited_;
  // If we have set up our TLS, we can avoid a scan of the thread_heaps_ list.
  if (tsd_inited_) {
    if (thread_local_data_) {
      return thread_local_data_;
    }
  }
#else
  const bool maybe_reentrant = true;
#endif

  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    const pthread_t me = pthread_self();

    // This may be a recursive malloc call from pthread_setspecific()
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    if (maybe_reentrant) {
      for (ThreadCache* h = thread_heaps_; h != nullptr; h = h->next_) {
        if (h->tid_ == me) {
          heap = h;
          break;
        }
      }
    }

    if (heap == nullptr) {
      heap = NewHeap(me);
    }
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
#ifdef ABSL_HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    thread_local_data_ = heap;
#endif
    pthread_setspecific(heap_key_, heap);
    heap->in_setspecific_ = false;
  }
  return heap;
}
ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache* heap = tc_globals.threadcache_allocator().New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = nullptr;
  if (thread_heaps_ != nullptr) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == nullptr);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;  // No caches yet
  ThreadCache* heap = GetCacheIfPresent();
  if (heap == nullptr) return;        // No thread cache to remove
  if (heap->in_setspecific_) return;  // Do not disturb the active caller

  heap->in_setspecific_ = true;
  pthread_setspecific(heap_key_, nullptr);
#ifdef ABSL_HAVE_TLS
  // Also update the copy in __thread
  thread_local_data_ = nullptr;
#endif
  heap->in_setspecific_ = false;
  if (GetCacheIfPresent() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr != nullptr) {
#ifdef ABSL_HAVE_TLS
    thread_local_data_ = nullptr;
#endif
    DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
  }
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  absl::base_internal::SpinLockHolder h(&pageheap_lock);
  if (heap->next_ != nullptr) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != nullptr) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == nullptr) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  tc_globals.threadcache_allocator().Delete(heap);
}

If you jump from New and similar calls into the functions they invoke, you will find the flow matches what the previous article described. Here we only look at one of them, the system-level Alloc:

void* Arena::Alloc(size_t bytes, int alignment) {
  ASSERT(alignment > 0);
  {  // First we need to move up to the correct alignment.
    const int misalignment =
        reinterpret_cast<uintptr_t>(free_area_) % alignment;
    const int alignment_bytes =
        misalignment != 0 ? alignment - misalignment : 0;
    free_area_ += alignment_bytes;
    free_avail_ -= alignment_bytes;
    bytes_allocated_ += alignment_bytes;
  }
  char* result;
  if (free_avail_ < bytes) {
    size_t ask = bytes > kAllocIncrement ? bytes : kAllocIncrement;
    // TODO(b/171081864): Arena allocations should be made relatively
    // infrequently.  Consider tagging this memory with sampled objects which
    // are also infrequently allocated.
    //
    // In the meantime it is important that we use the current NUMA partition
    // rather than always using a particular one because it's possible that any
    // single partition we choose might only contain nodes that the process is
    // unable to allocate from due to cgroup restrictions.
    MemoryTag tag;
    const auto& numa_topology = tc_globals.numa_topology();
    if (numa_topology.numa_aware()) {
      tag = NumaNormalTag(numa_topology.GetCurrentPartition());
    } else {
      tag = MemoryTag::kNormal;
    }

    auto [ptr, actual_size] = SystemAlloc(ask, kPageSize, tag);
    free_area_ = reinterpret_cast<char*>(ptr);
    if (ABSL_PREDICT_FALSE(free_area_ == nullptr)) {
      Crash(kCrash, __FILE__, __LINE__,
            "FATAL ERROR: Out of memory trying to allocate internal tcmalloc "
            "data (bytes, object-size); is something preventing mmap from "
            "succeeding (sandbox, VSS limitations)?",
            kAllocIncrement, bytes);
    }
    SystemBack(free_area_, actual_size);

    // We've discarded the previous free_area_, so any bytes that were
    // unallocated are effectively inaccessible to future allocations.
    bytes_unavailable_ += free_avail_;
    blocks_++;

    free_avail_ = actual_size;
  }

  ASSERT(reinterpret_cast<uintptr_t>(free_area_) % alignment == 0);
  result = free_area_;
  free_area_ += bytes;
  free_avail_ -= bytes;
  bytes_allocated_ += bytes;
  return reinterpret_cast<void*>(result);
}
AddressRange SystemAlloc(size_t bytes, size_t alignment, const MemoryTag tag) {
  // If default alignment is set request the minimum alignment provided by
  // the system.
  alignment = std::max(alignment, pagesize);

  // Discard requests that overflow
  if (bytes + alignment < bytes) return {nullptr, 0};

  absl::base_internal::SpinLockHolder lock_holder(&spinlock);

  InitSystemAllocatorIfNecessary();

  auto [result, actual_bytes] = region_manager->Alloc(bytes, alignment, tag);

  if (result != nullptr) {
    CheckAddressBits<kAddressBits>(reinterpret_cast<uintptr_t>(result) +
                                   actual_bytes - 1);
    ASSERT(GetMemoryTag(result) == tag);
  }
  return {result, actual_bytes};
}
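
Arena::Alloc above is a bump allocator over chunks obtained from SystemAlloc: align the cursor, grab a new chunk if the current one is too small (abandoning its tail), then advance the cursor. A minimal sketch of that pattern, with malloc standing in for SystemAlloc and error handling omitted; MiniArena is an illustrative name, not tcmalloc code.

#include <cstddef>
#include <cstdint>
#include <cstdlib>

class MiniArena {
 public:
  void* Alloc(std::size_t bytes, std::size_t alignment = 8) {
    // Round the cursor up to the requested alignment.
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(free_area_);
    std::size_t pad = (alignment - p % alignment) % alignment;
    if (free_avail_ < pad + bytes) {
      // Grab a fresh chunk; the tail of the old one becomes unusable,
      // just like bytes_unavailable_ in Arena::Alloc. (malloc errors ignored here.)
      std::size_t ask = bytes + alignment > kChunk ? bytes + alignment : kChunk;
      free_area_ = static_cast<char*>(std::malloc(ask));
      free_avail_ = ask;
      p = reinterpret_cast<std::uintptr_t>(free_area_);
      pad = (alignment - p % alignment) % alignment;
    }
    char* result = free_area_ + pad;
    free_area_ += pad + bytes;
    free_avail_ -= pad + bytes;
    return result;
  }

 private:
  static constexpr std::size_t kChunk = 128 * 1024;
  char* free_area_ = nullptr;
  std::size_t free_avail_ = 0;
};

int main() {
  MiniArena arena;
  void* a = arena.Alloc(24);
  void* b = arena.Alloc(100, 64);   // 64-byte aligned
  return (a != nullptr && b != nullptr) ? 0 : 1;
}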

So reading the code against the algorithm description makes everything fall into place. What takes some getting used to is the many individual coding styles inside; adapt to them, then absorb the good ones for your own use.

3. Allocating from the global caches

This is allocation through the transfer cache down to the CentralFreeList:

using ShardedTransferCacheManager =
    ShardedTransferCacheManagerBase<StaticForwarder, ProdCpuLayout,
                                    BackingTransferCache>;

class TransferCacheManager : public StaticForwarder {
  template <typename CentralFreeList, typename Manager>
  friend class internal_transfer_cache::TransferCache;
  using TransferCache =
      internal_transfer_cache::TransferCache<tcmalloc_internal::CentralFreeList,
                                             TransferCacheManager>;

  template <typename CentralFreeList, typename Manager>
  friend class internal_transfer_cache::RingBufferTransferCache;
  using RingBufferTransferCache =
      internal_transfer_cache::RingBufferTransferCache<
          tcmalloc_internal::CentralFreeList, TransferCacheManager>;

  friend class FakeMultiClassRingBufferManager;
  friend class FakeMultiClassTransferCacheManager;

 public:
  constexpr TransferCacheManager() {}

  TransferCacheManager(const TransferCacheManager &) = delete;
  TransferCacheManager &operator=(const TransferCacheManager &) = delete;

  void Init() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    implementation_ = ChooseImplementation();
    InitCaches();
  }

  void InsertRange(int size_class, absl::Span<void *> batch) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      cache_[size_class].rbtc.InsertRange(size_class, batch);
    } else {
      cache_[size_class].tc.InsertRange(size_class, batch);
    }
  }

  ABSL_MUST_USE_RESULT int RemoveRange(int size_class, void **batch, int n) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.RemoveRange(size_class, batch, n);
    } else {
      return cache_[size_class].tc.RemoveRange(size_class, batch, n);
    }
  }

  // This is not const because the underlying ring-buffer transfer cache
  // function requires acquiring a lock.
  size_t tc_length(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.tc_length();
    } else {
      return cache_[size_class].tc.tc_length();
    }
  }

  bool HasSpareCapacity(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.HasSpareCapacity(size_class);
    } else {
      return cache_[size_class].tc.HasSpareCapacity(size_class);
    }
  }

  TransferCacheStats GetStats(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.GetStats();
    } else {
      return cache_[size_class].tc.GetStats();
    }
  }

  CentralFreeList &central_freelist(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.freelist();
    } else {
      return cache_[size_class].tc.freelist();
    }
  }

  TransferCacheImplementation implementation() const { return implementation_; }

  bool CanIncreaseCapacity(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.CanIncreaseCapacity(size_class);
    } else {
      return cache_[size_class].tc.CanIncreaseCapacity(size_class);
    }
  }

  // We try to grow up to 10% of the total number of size classes during one
  // resize interval.
  static constexpr double kFractionClassesToResize = 0.1;
  static constexpr int kMaxSizeClassesToResize = std::max<int>(
      static_cast<int>(kNumClasses * kFractionClassesToResize), 1);

  // Tries to resize transfer caches based on the number of misses that they
  // incurred during the previous resize interval.
  void TryResizingCaches() {
    internal_transfer_cache::TryResizingCaches(*this);
  }

  static TransferCacheImplementation ChooseImplementation();

  void InitCaches() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    for (int i = 0; i < kNumClasses; ++i) {
      if (implementation_ == TransferCacheImplementation::Ring) {
        new (&cache_[i].rbtc) RingBufferTransferCache(this, i);
      } else {
        new (&cache_[i].tc) TransferCache(this, i);
      }
    }
  }

  bool ShrinkCache(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.ShrinkCache(size_class);
    } else {
      return cache_[size_class].tc.ShrinkCache(size_class);
    }
  }

  bool IncreaseCacheCapacity(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.IncreaseCacheCapacity(size_class);
    } else {
      return cache_[size_class].tc.IncreaseCacheCapacity(size_class);
    }
  }

  size_t FetchCommitIntervalMisses(int size_class) {
    if (implementation_ == TransferCacheImplementation::Ring) {
      return cache_[size_class].rbtc.FetchCommitIntervalMisses();
    } else {
      return cache_[size_class].tc.FetchCommitIntervalMisses();
    }
  }

 private:
  TransferCacheImplementation implementation_ =
      TransferCacheImplementation::Legacy;
  union Cache {
    constexpr Cache() : dummy(false) {}
    ~Cache() {}

    TransferCache tc;
    RingBufferTransferCache rbtc;
    bool dummy;
  };
  Cache cache_[kNumClasses];
} ABSL_CACHELINE_ALIGNED;

#else  // small-but-slow build (the matching #if is above this excerpt)

// For the small memory model, the transfer cache is not used.
class TransferCacheManager {
 public:
  constexpr TransferCacheManager() : freelist_() {}
  TransferCacheManager(const TransferCacheManager&) = delete;
  TransferCacheManager& operator=(const TransferCacheManager&) = delete;

  void Init() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    for (int i = 0; i < kNumClasses; ++i) {
      freelist_[i].Init(i);
    }
  }

  void InsertRange(int size_class, absl::Span<void*> batch) {
    freelist_[size_class].InsertRange(batch);
  }

  ABSL_MUST_USE_RESULT int RemoveRange(int size_class, void** batch, int n) {
    return freelist_[size_class].RemoveRange(batch, n);
  }

  static constexpr size_t tc_length(int size_class) { return 0; }

  static constexpr TransferCacheStats GetStats(int size_class) { return {}; }

  const CentralFreeList& central_freelist(int size_class) const {
    return freelist_[size_class];
  }

  CentralFreeList& central_freelist(int size_class) {
    return freelist_[size_class];
  }

  TransferCacheImplementation implementation() const {
    return TransferCacheImplementation::None;
  }

 private:
  CentralFreeList freelist_[kNumClasses];
};

#endif

As the names suggest, this layer is a cache that maintains these lists, continually taking back objects that are freed and handing out objects to be used.
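
One detail worth calling out in TransferCacheManager is the Cache union: InitCaches() placement-news either a TransferCache or a RingBufferTransferCache into it, and every call then dispatches on implementation_. A stripped-down sketch of that pattern follows; RingImpl, LegacyImpl and MiniTransferCacheManager are illustrative stand-ins, not tcmalloc types.

#include <cstdio>
#include <new>

// Two stand-in implementations with the same interface.
struct RingImpl   { int RemoveRange(int n) { return n; } };
struct LegacyImpl { int RemoveRange(int n) { return n > 0 ? 1 : 0; } };

class MiniTransferCacheManager {
 public:
  enum class Kind { kRing, kLegacy };

  explicit MiniTransferCacheManager(Kind kind) : kind_(kind) {
    // Like InitCaches(): placement-new the chosen variant into the union.
    if (kind_ == Kind::kRing) new (&cache_.ring) RingImpl();
    else                      new (&cache_.legacy) LegacyImpl();
  }

  int RemoveRange(int n) {
    // Every call dispatches on the flag, as TransferCacheManager does.
    return kind_ == Kind::kRing ? cache_.ring.RemoveRange(n)
                                : cache_.legacy.RemoveRange(n);
  }

 private:
  Kind kind_;
  union Cache {
    Cache() {}    // members are constructed later via placement new
    ~Cache() {}   // both variants are trivially destructible here
    RingImpl ring;
    LegacyImpl legacy;
  } cache_;
};

int main() {
  MiniTransferCacheManager m(MiniTransferCacheManager::Kind::kRing);
  std::printf("removed %d objects\n", m.RemoveRange(8));
}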

4. Span allocation
As mentioned earlier, the ultimate unit of management is the span; a span and its objects are two views of the same memory. The allocator we saw on the PageHeap side returns Span pointers:

Span* StaticForwarder::MapObjectToSpan(const void* object) {
  const PageId p = PageIdContaining(object);
  Span* span = tc_globals.pagemap().GetExistingDescriptor(p);
  return span;
}

Span* StaticForwarder::AllocateSpan(int size_class, size_t objects_per_span,
                                    Length pages_per_span) {
  const MemoryTag tag = MemoryTagFromSizeClass(size_class);
  Span* span =
      tc_globals.page_allocator().New(pages_per_span, objects_per_span, tag);
  if (ABSL_PREDICT_FALSE(span == nullptr)) {
    return nullptr;
  }
  ASSERT(tag == GetMemoryTag(span->start_address()));
  ASSERT(span->num_pages() == pages_per_span);

  tc_globals.pagemap().RegisterSizeClass(span, size_class);
  return span;
}

static void ReturnSpansToPageHeap(MemoryTag tag, absl::Span<Span*> free_spans,
                                  size_t objects_per_span)
    ABSL_LOCKS_EXCLUDED(pageheap_lock) {
  absl::base_internal::SpinLockHolder h(&pageheap_lock);
  for (Span* const free_span : free_spans) {
    ASSERT(tag == GetMemoryTag(free_span->start_address()));
    tc_globals.page_allocator().Delete(free_span, objects_per_span, tag);
  }
}

void StaticForwarder::DeallocateSpans(int size_class, size_t objects_per_span,
                                      absl::Span<Span*> free_spans) {
  // Unregister size class doesn't require holding any locks.
  for (Span* const free_span : free_spans) {
    ASSERT(IsNormalMemory(free_span->start_address()) ||
           IsColdMemory(free_span->start_address()));
    tc_globals.pagemap().UnregisterSizeClass(free_span);

    // Before taking pageheap_lock, prefetch the PageTrackers these spans are
    // on.
    //
    // Small-but-slow does not use the HugePageAwareAllocator (by default), so
    // do not prefetch on this config.
#ifndef TCMALLOC_SMALL_BUT_SLOW
    const PageId p = free_span->first_page();

    // In huge_page_filler.h, we static_assert that PageTracker's key elements
    // for deallocation are within the first two cachelines.
    void* pt = tc_globals.pagemap().GetHugepage(p);
    // Prefetch for writing, as we will issue stores to the PageTracker
    // instance.
    __builtin_prefetch(pt, 1, 3);
    __builtin_prefetch(reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(pt) +
                                               ABSL_CACHELINE_SIZE),
                       1, 3);
#endif  // TCMALLOC_SMALL_BUT_SLOW
  }

  const MemoryTag tag = MemoryTagFromSizeClass(size_class);
  ReturnSpansToPageHeap(tag, free_spans, objects_per_span);
}
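
A quick bit of arithmetic ties spans and objects together: a span of pages_per_span pages carves into bytes_in_span / object_size objects, and the remainder is the per-span waste that OverheadBytes() describes in CentralFreeList. The numbers below are illustrative; real size classes and pages-per-span come from tcmalloc's size map.

#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t kPageSize = 8192;    // tcmalloc's default page size
  constexpr std::size_t kPagesPerSpan = 1;   // illustrative value
  const std::size_t bytes_in_span = kPagesPerSpan * kPageSize;

  for (std::size_t object_size : {std::size_t{1024}, std::size_t{5}}) {
    std::size_t objects_per_span = bytes_in_span / object_size;
    std::size_t overhead_bytes = bytes_in_span % object_size;  // internal fragmentation
    std::printf("span of %zu bytes, object size %zu -> %zu objects, %zu bytes overhead\n",
                bytes_in_span, object_size, objects_per_span, overhead_bytes);
  }
}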

As noted before, every allocator defines its own New; here is how PageHeap ends up calling Span::New:

bool PageHeap::GrowHeap(Length n) {
  if (n > Length::max()) return false;
  auto [ptr, actual_size] = SystemAlloc(n.in_bytes(), kPageSize, tag_);
  if (ptr == nullptr) return false;
  n = BytesToLengthFloor(actual_size);

  stats_.system_bytes += actual_size;
  const PageId p = PageIdContaining(ptr);
  ASSERT(p > PageId{0});

  // If we have already a lot of pages allocated, just pre allocate a bunch of
  // memory for the page map. This prevents fragmentation by pagemap metadata
  // when a program keeps allocating and freeing large blocks.

  // Make sure pagemap has entries for all of the new pages.
  // Plus ensure one before and one after so coalescing code
  // does not need bounds-checking.
  if (pagemap_->Ensure(p - Length(1), n + Length(2))) {
    // Pretend the new area is allocated and then return it to cause
    // any necessary coalescing to occur.
    Span* span = Span::New(p, n);
    RecordSpan(span);
    span->set_location(Span::ON_RETURNED_FREELIST);
    MergeIntoFreeList(span);
    ASSERT(Check());
    return true;
  } else {
    // We could not allocate memory within the pagemap.
    // Note the following leaks virtual memory, but at least it gets rid of
    // the underlying physical memory.
    SystemRelease(ptr, actual_size);
    return false;
  }
}

// Why are these functions here? Because we want to inline them, but they
// need access to Static::span_allocator. Putting them in span.h would lead
// to nasty dependency loops.  Since anything that needs them certainly
// includes static_vars.h, this is a perfectly good compromise.
// TODO(b/134687001): move span_allocator to Span, getting rid of the need for
// this.
inline Span* Span::New(PageId p, Length len) {
  Span* result = Static::span_allocator().New();
  result->Init(p, len);
  return result;
}
  ABSL_ATTRIBUTE_RETURNS_NONNULL T* New()
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    // Consult free list
    T* result = free_list_;
    stats_.in_use++;
    if (ABSL_PREDICT_FALSE(result == nullptr)) {
      stats_.total++;
      return reinterpret_cast<T*>(arena_->Alloc(sizeof(T)));
    }
    free_list_ = *(reinterpret_cast<T**>(free_list_));
    return result;
  }

Does this look familiar? It is the same intrusive free-list technique used by the ThreadCache earlier, passed down through the layers.

5. mmap-level memory management

Finally, a look at how the underlying memory is managed and wrapped, namely RegionManager:

void InitSystemAllocatorIfNecessary() {
  if (region_factory) return;
  pagesize = getpagesize();
  // Sets the preferred alignment to be the largest of either the alignment
  // returned by mmap() or our minimum allocation size. The minimum allocation
  // size is usually a multiple of page size, but this need not be true for
  // SMALL_BUT_SLOW where we do not allocate in units of huge pages.
  preferred_alignment = std::max(pagesize, kMinSystemAlloc);
  region_manager = new (&region_manager_space) RegionManager();
  region_factory = new (&mmap_space) MmapRegionFactory();
}

std::pair<void*, size_t> RegionManager::Alloc(size_t request_size,
                                              size_t alignment,
                                              const MemoryTag tag) {
  constexpr uintptr_t kTagFree = uintptr_t{1} << kTagShift;

  // We do not support size or alignment larger than kTagFree.
  // TODO(b/141325493): Handle these large allocations.
  if (request_size > kTagFree || alignment > kTagFree) return {nullptr, 0};

  // If we are dealing with large sizes, or large alignments we do not
  // want to throw away the existing reserved region, so instead we
  // return a new region specifically targeted for the request.
  if (request_size > kMinMmapAlloc || alignment > kMinMmapAlloc) {
    // Align on kMinSystemAlloc boundaries to reduce external fragmentation for
    // future allocations.
    size_t size = RoundUp(request_size, kMinSystemAlloc);
    if (size < request_size) return {nullptr, 0};
    alignment = std::max(alignment, preferred_alignment);
    void* ptr = MmapAligned(size, alignment, tag);
    if (!ptr) return {nullptr, 0};

    const auto region_type = TagToHint(tag);
    AddressRegion* region = region_factory->Create(ptr, size, region_type);
    if (!region) {
      munmap(ptr, size);
      return {nullptr, 0};
    }
    std::pair<void*, size_t> result = region->Alloc(size, alignment);
    if (result.first != nullptr) {
      ASSERT(result.first == ptr);
      ASSERT(result.second == size);
    } else {
      ASSERT(result.second == 0);
    }
    return result;
  }
  return Allocate(request_size, alignment, tag);
}

std::pair<void*, size_t> RegionManager::Allocate(size_t size, size_t alignment,
                                                 const MemoryTag tag) {
  AddressRegion*& region = *[&]() {
    switch (tag) {
      case MemoryTag::kNormal:
        return &normal_region_[0];
      case MemoryTag::kNormalP1:
        return &normal_region_[1];
      case MemoryTag::kSampled:
        return &sampled_region_;
      case MemoryTag::kCold:
        return &cold_region_;
      default:
        ASSUME(false);
        __builtin_unreachable();
    }
  }();
  // For sizes that fit in our reserved range first of all check if we can
  // satisfy the request from what we have available.
  if (region) {
    std::pair<void*, size_t> result = region->Alloc(size, alignment);
    if (result.first) return result;
  }

  // Allocation failed so we need to reserve more memory.
  // Reserve new region and try allocation again.
  void* ptr = MmapAligned(kMinMmapAlloc, kMinMmapAlloc, tag);
  if (!ptr) return {nullptr, 0};

  const auto region_type = TagToHint(tag);
  region = region_factory->Create(ptr, kMinMmapAlloc, region_type);
  if (!region) {
    munmap(ptr, kMinMmapAlloc);
    return {nullptr, 0};
  }
  return region->Alloc(size, alignment);
}

system-alloc.cc defines several foundational abstractions: besides RegionManager there are MmapRegionFactory and MmapRegion, all important and worth studying closely. Keep Alloc and Allocate straight here: the former is the public entry point, the latter a private memory management helper. The new operator ultimately drives into the relevant Alloc; once that is clear, the whole allocation path is clear. It bears repeating.
Both of the functions above call the same helper:

void* MmapAligned(size_t size, size_t alignment, const MemoryTag tag) {
  ASSERT(size <= kTagMask);
  ASSERT(alignment <= kTagMask);

  static uintptr_t next_sampled_addr = 0;
  static std::array<uintptr_t, kNumaPartitions> next_normal_addr = {0};
  static uintptr_t next_cold_addr = 0;

  absl::optional<int> numa_partition;
  uintptr_t& next_addr = *[&]() {
    switch (tag) {
      case MemoryTag::kSampled:
        return &next_sampled_addr;
      case MemoryTag::kNormalP0:
        numa_partition = 0;
        return &next_normal_addr[0];
      case MemoryTag::kNormalP1:
        numa_partition = 1;
        return &next_normal_addr[1];
      case MemoryTag::kCold:
        return &next_cold_addr;
      default:
        ASSUME(false);
        __builtin_unreachable();
    }
  }();

  if (!next_addr || next_addr & (alignment - 1) ||
      GetMemoryTag(reinterpret_cast<void*>(next_addr)) != tag ||
      GetMemoryTag(reinterpret_cast<void*>(next_addr + size - 1)) != tag) {
    next_addr = RandomMmapHint(size, alignment, tag);
  }
  void* hint;
  for (int i = 0; i < 1000; ++i) {
    hint = reinterpret_cast<void*>(next_addr);
    ASSERT(GetMemoryTag(hint) == tag);
    // TODO(b/140190055): Use MAP_FIXED_NOREPLACE once available.
    void* result =
        mmap(hint, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (result == hint) {
      if (numa_partition.has_value()) {
        BindMemory(result, size, *numa_partition);
      }
      // Attempt to keep the next mmap contiguous in the common case.
      next_addr += size;
      CHECK_CONDITION(kAddressBits == std::numeric_limits<uintptr_t>::digits ||
                      next_addr <= uintptr_t{1} << kAddressBits);

      ASSERT((reinterpret_cast<uintptr_t>(result) & (alignment - 1)) == 0);
      return result;
    }
    if (result == MAP_FAILED) {
      Log(kLogWithStack, __FILE__, __LINE__,
          "mmap() reservation failed (hint, size, error)", hint, size,
          strerror(errno));
      return nullptr;
    }
    if (int err = munmap(result, size)) {
      Log(kLogWithStack, __FILE__, __LINE__, "munmap() failed");
      ASSERT(err == 0);
    }
    next_addr = RandomMmapHint(size, alignment, tag);
  }

  Log(kLogWithStack, __FILE__, __LINE__,
      "MmapAligned() failed - unable to allocate with tag (hint, size, "
      "alignment) - is something limiting address placement?",
      hint, size, alignment);
  return nullptr;
}

There it is: the mmap() call. From this point down it proceeds like any ordinary allocation, so there is no need to go further.
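
As the code shows, MmapAligned reserves address space with PROT_NONE, so nothing is usable until it is backed later (the SystemBack call in Arena::Alloc above). Below is a minimal sketch of that reserve-then-commit pattern on Linux, using mprotect for the backing step, which is one common way to do it; tcmalloc's SystemBack may differ in detail.

#include <cstddef>
#include <cstdio>
#include <sys/mman.h>

int main() {
  // Reserve 2 MiB of address space with no access rights: nothing is backed yet.
  const std::size_t kReserve = 2 * 1024 * 1024;
  void* base = mmap(nullptr, kReserve, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (base == MAP_FAILED) { std::perror("mmap"); return 1; }

  // "Back" the first 64 KiB so it can actually be read and written.
  const std::size_t kBacked = 64 * 1024;
  if (mprotect(base, kBacked, PROT_READ | PROT_WRITE) != 0) {
    std::perror("mprotect");
    return 1;
  }
  static_cast<char*>(base)[0] = 1;   // touching the unbacked remainder would fault
  std::printf("reserved %zu bytes at %p, backed the first %zu\n",
              kReserve, base, kBacked);
  munmap(base, kReserve);
}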

IV. Deallocation Flow Source Code

The deallocation flow has in fact already appeared above. The low-level redefinitions of new and delete, and of malloc and free, all live in libc_override_redefine.h:

extern "C" {
void* malloc(size_t s) { return TCMallocInternalMalloc(s); }
void* calloc(size_t n, size_t s) { return TCMallocInternalCalloc(n, s); }
void* realloc(void* p, size_t s) { return TCMallocInternalRealloc(p, s); }
void free(void* p) { TCMallocInternalFree(p); }
void* memalign(size_t a, size_t s) { return TCMallocInternalMemalign(a, s); }
int posix_memalign(void** r, size_t a, size_t s) {
  return TCMallocInternalPosixMemalign(r, a, s);
}
size_t malloc_usable_size(void* p) { return TCMallocInternalMallocSize(p); }
void* aligned_alloc(size_t a, size_t s) {
  return TCMallocInternalAlignedAlloc(a, s);
}

// tcmalloc extension
void sdallocx(void* p, size_t s, int flags) noexcept {
  TCMallocInternalSdallocx(p, s, flags);
}

#if defined(__GLIBC__) || defined(__NEWLIB__)
// SunOS extension
void cfree(void* p) { TCMallocInternalCfree(p); }
#endif

#if defined(OS_MACOSX) || defined(__BIONIC__) || defined(__GLIBC__) || \
    defined(__NEWLIB__) || defined(__UCLIBC__)
// Obsolete memalign
void* valloc(size_t s) { return TCMallocInternalValloc(s); }
#endif

#if defined(__BIONIC__) || defined(__GLIBC__) || defined(__NEWLIB__)
// Obsolete memalign
void* pvalloc(size_t s) { return TCMallocInternalPvalloc(s); }
#endif

#if defined(__GLIBC__) || defined(__NEWLIB__) || defined(__UCLIBC__)
void malloc_stats(void) { TCMallocInternalMallocStats(); }
#endif

#ifdef TCMALLOC_HAVE_MALLOC_TRIM
int malloc_trim(size_t pad) { return TCMallocInternalMallocTrim(pad); }
#endif

#if defined(__BIONIC__) || defined(__GLIBC__) || defined(__NEWLIB__) || \
    defined(__UCLIBC__)
int mallopt(int cmd, int v) { return TCMallocInternalMallOpt(cmd, v); }
#endif

#ifdef TCMALLOC_HAVE_STRUCT_MALLINFO
struct mallinfo mallinfo(void) {
  return TCMallocInternalMallocInfo();
}
#endif

#if defined(__GLIBC__)
size_t malloc_size(void* p) { return TCMallocInternalMallocSize(p); }
#endif
}  // extern "C"

This means the upper-level abstractions only manage the application-facing logic; underneath, tcmalloc's own allocation and deallocation APIs are used. Within each of the manager classes above, the functions named Delete, Destroy, and the like are the recycling paths; their mechanisms were covered earlier and are not repeated here.

V. Summary

If you have written a memory pool yourself, or intend to, compare your work against how this one is written. Don't worry that yours looks crude; everyone starts out crude. Recognize this code's strengths and your own weaknesses, but also notice where your thinking lines up with excellent source code. Stay clear-eyed about the gaps, learn from others' strengths, and also see their limitations and the scenarios they fit. Have the confidence that, given enough time and the right conditions, you could build an excellent memory management framework of your own.
Daring to do what others dare not: that is courage!
