当前位置: 首页 > 工具软件 > Arrow > 使用案例 >

Arrow 之 list

袁子瑜
2023-12-01


arrow/cpp/src/arrow/type.cc

 std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
   return std::make_shared<ListType>(value_type);
 }

ListType

arrow/cpp/src/arrow/type.h

/// \brief Concrete type class for list data
///
/// List data is nested data where each value is a variable number of
/// child items.  Lists can be recursively nested, for example
/// list(list(int32)).
class ARROW_EXPORT ListType : public BaseListType {
 public:
  static constexpr Type::type type_id = Type::LIST;
  using offset_type = int32_t;

  static constexpr const char* type_name() { return "list"; }

  // List can contain any other logical value type
  explicit ListType(const std::shared_ptr<DataType>& value_type)
      : ListType(std::make_shared<Field>("item", value_type)) {}

  explicit ListType(const std::shared_ptr<Field>& value_field) : BaseListType(type_id) {
    children_ = {value_field};
  }

  DataTypeLayout layout() const override {
    return DataTypeLayout(
        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
  }

  std::string ToString() const override;

  std::string name() const override { return "list"; }

 protected:
  std::string ComputeFingerprint() const override;
};
LIST

static constexpr Type::type type_id = Type::LIST;

/// \brief Main data type enumeration
  ///
  /// This enumeration provides a quick way to interrogate the category
  /// of a DataType instance.
  enum type {
/// A list of some logical data type
    LIST,
  }
ListType 之 ToString()

arrow/cpp/src/arrow/type.cc
list<item: list<item: int8>>
在这里实现了递归

std::string ListType::ToString() const {
  std::stringstream s;
  s << "list<" << value_field()->ToString() << ">";
  return s.str();
}

arrow/cpp/src/arrow/type.h

/// \brief Base class for all variable-size list data types
class ARROW_EXPORT BaseListType : public NestedType {
 public:
  using NestedType::NestedType;
  std::shared_ptr<Field> value_field() const { return children_[0]; }

  std::shared_ptr<DataType> value_type() const { return children_[0]->type(); }
};

arrow/cpp/src/arrow/type.h
“item”

// List can contain any other logical value type
  explicit ListType(const std::shared_ptr<DataType>& value_type)
      : ListType(std::make_shared<Field>("item", value_type)) {}

BaseListBuilder

arrow/cpp/src/arrow/array/builder_nested.h

// ----------------------------------------------------------------------
// List builder

template <typename TYPE>
class BaseListBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// Use this constructor to incrementally build the value array along with offsets and
  /// null bitmap.
  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
                  const std::shared_ptr<DataType>& type)
      : ArrayBuilder(pool),
        offsets_builder_(pool),
        value_builder_(value_builder),
        value_field_(type->field(0)->WithType(NULLPTR)) {}

  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
      : BaseListBuilder(pool, value_builder, list(value_builder->type())) {}

  Status Resize(int64_t capacity) override {
    if (capacity > maximum_elements()) {
      return Status::CapacityError("List array cannot reserve space for more than ",
                                   maximum_elements(), " got ", capacity);
    }
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_builder_->Reset();
  }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const offset_type* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    offsets_builder_.UnsafeAppend(offsets, length);
    return Status::OK();
  }

  /// \brief Start a new variable-length list slot
  ///
  /// This function should be called before beginning to append elements to the
  /// value builder
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return AppendNextOffset();
  }

  Status AppendNull() final { return Append(false); }

  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    UnsafeAppendToBitmap(length, false);
    const int64_t num_values = value_builder_->length();
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
    }
    return Status::OK();
  }

  Status AppendEmptyValue() final { return Append(true); }

  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    UnsafeAppendToBitmap(length, true);
    const int64_t num_values = value_builder_->length();
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
    }
    return Status::OK();
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // Offset padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    if (value_builder_->length() == 0) {
      // Try to make sure we get a non-null values buffer (ARROW-2744)
      ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
    }

    std::shared_ptr<ArrayData> items;
    ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
                           null_count_);
    Reset();
    return Status::OK();
  }

  Status ValidateOverflow(int64_t new_elements) const {
    auto new_length = value_builder_->length() + new_elements;
    if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
      return Status::CapacityError("List array cannot contain more than ",
                                   maximum_elements(), " elements, have ", new_elements);
    } else {
      return Status::OK();
    }
  }

  ArrayBuilder* value_builder() const { return value_builder_.get(); }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t maximum_elements() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  std::shared_ptr<ArrayBuilder> value_builder_;
  std::shared_ptr<Field> value_field_;

  Status AppendNextOffset() {
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    const int64_t num_values = value_builder_->length();
    return offsets_builder_.Append(static_cast<offset_type>(num_values));
  }
};

Decimal128Builder

class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool());

  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(Decimal128 val);
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(util::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal128Type> decimal_type_;
};

FixedSizeBinaryBuilder

class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool());

  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  Status Append(const util::string_view& view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        util::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  void UnsafeAppend(util::string_view value) {
...
...

BaseBinaryBuilder

// ----------------------------------------------------------------------
// Binary and String

template <typename TYPE>
class BaseBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {}

  BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : BaseBinaryBuilder(pool) {}

  Status Append(const uint8_t* value, offset_type length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }

    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  Status Append(const char* value, offset_type length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(util::string_view value) {
    return Append(value.data(), static_cast<offset_type>(value.size()));
  }

  Status AppendNulls(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  Status AppendEmptyValues(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append without checking capacity
  ///
  /// Offsets and data should have been presized using Reserve() and
  /// ReserveData(), respectively.
  void UnsafeAppend(const uint8_t* value, offset_type length) {
    UnsafeAppendNextOffset();
    value_data_builder_.UnsafeAppend(value, length);
    UnsafeAppendToBitmap(true);
  }

  void UnsafeAppend(const char* value, offset_type length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
  }

  void UnsafeAppend(util::string_view value) {
    UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
  }

  void UnsafeAppendNull() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append a sequence of strings in one shot.
  ///
  /// \param[in] values a vector of strings
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const std::vector<std::string>& values,
                      const uint8_t* valid_bytes = NULLPTR) {
    std::size_t total_length = std::accumulate(
        values.begin(), values.end(), 0ULL,
        [](uint64_t sum, const std::string& str) { return sum + str.size(); });
    ARROW_RETURN_NOT_OK(Reserve(values.size()));
    ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
    ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));

    if (valid_bytes != NULLPTR) {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          value_data_builder_.UnsafeAppend(
              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
        }
      }
    } else {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        value_data_builder_.UnsafeAppend(
            reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
      }
    }

    UnsafeAppendToBitmap(valid_bytes, values.size());
    return Status::OK();
  }

  /// \brief Append a sequence of nul-terminated strings in one shot.
  ///        If one of the values is NULL, it is processed as a null
  ///        value even if the corresponding valid_bytes entry is 1.
  ///
  /// \param[in] values a contiguous C array of nul-terminated char *
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const char** values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    std::size_t total_length = 0;
    std::vector<std::size_t> value_lengths(length);
    bool have_null_value = false;
    for (int64_t i = 0; i < length; ++i) {
      if (values[i] != NULLPTR) {
        auto value_length = strlen(values[i]);
        value_lengths[i] = value_length;
        total_length += value_length;
      } else {
        have_null_value = true;
      }
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));

    if (valid_bytes) {
      int64_t valid_bytes_offset = 0;
      for (int64_t i = 0; i < length; ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
          } else {
            UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
                                 i - valid_bytes_offset);
            UnsafeAppendToBitmap(false);
            valid_bytes_offset = i + 1;
          }
        }
      }
      UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
    } else {
      if (have_null_value) {
        std::vector<uint8_t> valid_vector(length, 0);
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
            valid_vector[i] = 1;
          }
        }
        UnsafeAppendToBitmap(valid_vector.data(), length);
      } else {
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                           value_lengths[i]);
        }
        UnsafeAppendToBitmap(NULLPTR, length);
      }
    }
    return Status::OK();
  }

  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_data_builder_.Reset();
  }

  Status ValidateOverflow(int64_t new_bytes) {
    auto new_size = value_data_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    } else {
      return Status::OK();
    }
  }

  Status Resize(int64_t capacity) override {
    // XXX Why is this check necessary?  There is no reason to disallow, say,
    // binary arrays with more than 2**31 empty or null values.
    if (capacity > memory_limit()) {
      return Status::CapacityError("BinaryBuilder cannot reserve space for more than ",
                                   memory_limit(), " child elements, got ", capacity);
    }
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return value_data_builder_.Reserve(elements);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    // Write final offset (values length)
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // These buffers' padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
                           null_count_, 0);
    Reset();
    return Status::OK();
  }

  /// \return data pointer of the value date builder
  const uint8_t* value_data() const { return value_data_builder_.data(); }
  /// \return size of values buffer so far
  int64_t value_data_length() const { return value_data_builder_.length(); }
  /// \return capacity of values buffer
  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }

  /// \return data pointer of the value date builder
  const offset_type* offsets_data() const { return offsets_builder_.data(); }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    const offset_type* offsets = offsets_builder_.data();
    const auto offset = offsets[i];
    if (i == (length_ - 1)) {
      *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
    } else {
      *out_length = offsets[i + 1] - offset;
    }
    return value_data_builder_.data() + offset;
  }

  offset_type offset(int64_t i) const { return offsets_data()[i]; }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  util::string_view GetView(int64_t i) const {
    offset_type value_length;
    const uint8_t* value_data = GetValue(i, &value_length);
    return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
  }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  TypedBufferBuilder<uint8_t> value_data_builder_;

  Status AppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
  }

  void UnsafeAppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }
};

NumericBuilder

/// Base class for all Builders that emit an Array of a scalar numerical type.
template <typename T>
class NumericBuilder : public ArrayBuilder {
 public:
  using TypeClass = T;
  using value_type = typename T::c_type;
  using ArrayType = typename TypeTraits<T>::ArrayType;

  template <typename T1 = T>
  explicit NumericBuilder(
      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
      : ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {}

  NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : ArrayBuilder(pool), type_(type), data_builder_(pool) {}

  /// Append a single scalar and increase the size if necessary.
  Status Append(const value_type val) {
    ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
    UnsafeAppend(val);
    return Status::OK();
  }

  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
  /// The memory at the corresponding data slot is set to 0 to prevent
  /// uninitialized memory access
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append a empty element
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }

  void Reset() override { data_builder_.Reset(); }

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  value_type operator[](int64_t index) const { return GetValue(index); }

  value_type& operator[](int64_t index) {
    return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is update by these
    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is update by these
    ArrayBuilder::UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values,
                      const std::vector<bool>& is_valid) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    std::shared_ptr<Buffer> data, null_bitmap;
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
    ARROW_RETURN_NOT_OK(data_builder_.Finish(&data));
    *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
    capacity_ = length_ = null_count_ = 0;
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \return Status
  template <typename ValuesIter>
  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot, with a specified nullmap
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indication valid(1)
  ///  or null(0) values.
  /// \return Status
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    static_assert(!internal::is_null_pointer<ValidIter>::value,
                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                  "version instead");
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    null_bitmap_builder_.UnsafeAppend<true>(
        length, [&valid_begin]() -> bool { return *valid_begin++; });
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  // Same as above, with a pointer type ValidIter
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    if (valid_begin == NULLPTR) {
      UnsafeSetNotNull(length);
    } else {
      null_bitmap_builder_.UnsafeAppend<true>(
          length, [&valid_begin]() -> bool { return *valid_begin++; });
      length_ = null_bitmap_builder_.length();
      null_count_ = null_bitmap_builder_.false_count();
    }

    return Status::OK();
  }

  /// Append a single scalar under the assumption that the underlying Buffer is
  /// large enough.
  ///
  /// This method does not capacity-check; make sure to call Reserve
  /// beforehand.
  void UnsafeAppend(const value_type val) {
    ArrayBuilder::UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(val);
  }

  void UnsafeAppendNull() {
    ArrayBuilder::UnsafeAppendToBitmap(false);
    data_builder_.UnsafeAppend(value_type{});  // zero
  }

  std::shared_ptr<DataType> type() const override { return type_; }

 protected:
  std::shared_ptr<DataType> type_;
  TypedBufferBuilder<value_type> data_builder_;
};

ArrayBuilder

/// Base class for all data array builders.
///
/// This class provides a facilities for incrementally building the null bitmap
/// (see Append methods) and as a side effect the current number of slots and
/// the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
 public:
  explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}

  virtual ~ArrayBuilder() = default;

  /// For nested types. Since the objects are owned by this class instance, we
  /// skip shared pointers and just return a raw pointer
  ArrayBuilder* child(int i) { return children_[i].get(); }

  const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }

  int num_children() const { return static_cast<int>(children_.size()); }

  virtual int64_t length() const { return length_; }
  int64_t null_count() const { return null_count_; }
  int64_t capacity() const { return capacity_; }

  /// \brief Ensure that enough memory has been allocated to fit the indicated
  /// number of total elements in the builder, including any that have already
  /// been appended. Does not account for reallocations that may be due to
  /// variable size data, like binary values. To make space for incremental
  /// appends, use Reserve instead.
  ///
  /// \param[in] capacity the minimum number of total array values to
  ///            accommodate. Must be greater than the current capacity.
  /// \return Status
  virtual Status Resize(int64_t capacity);

  /// \brief Ensure that there is enough space allocated to append the indicated
  /// number of elements without any further reallocation. Overallocation is
  /// used in order to minimize the impact of incremental Reserve() calls.
  /// Note that additional_capacity is relative to the current number of elements
  /// rather than to the current capacity, so calls to Reserve() which are not
  /// interspersed with addition of new elements may not increase the capacity.
  ///
  /// \param[in] additional_capacity the number of additional array values
  /// \return Status
  Status Reserve(int64_t additional_capacity) {
    auto current_capacity = capacity();
    auto min_capacity = length() + additional_capacity;
    if (min_capacity <= current_capacity) return Status::OK();

    // leave growth factor up to BufferBuilder
    auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
    return Resize(new_capacity);
  }

  /// Reset the builder.
  virtual void Reset();

  /// \brief Append a null value to builder
  virtual Status AppendNull() = 0;
  /// \brief Append a number of null values to builder
  virtual Status AppendNulls(int64_t length) = 0;

  /// \brief Append a non-null value to builder
  ///
  /// The appended value is an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending a null value to a parent nested type.
  virtual Status AppendEmptyValue() = 0;

  /// \brief Append a number of non-null values to builder
  ///
  /// The appended values are an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending null values to a parent nested type.
  virtual Status AppendEmptyValues(int64_t length) = 0;

  /// For cases where raw data was memcpy'd into the internal buffers, allows us
  /// to advance the length of the builder. It is your responsibility to use
  /// this function responsibly.
  Status Advance(int64_t elements);

  /// \brief Return result of builder as an internal generic ArrayData
  /// object. Resets builder except for dictionary builder
  ///
  /// \param[out] out the finalized ArrayData object
  /// \return Status
  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \param[out] out the finalized Array object
  /// \return Status
  Status Finish(std::shared_ptr<Array>* out);

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \return The finalized Array object
  Result<std::shared_ptr<Array>> Finish();

  /// \brief Return the type of the built Array
  virtual std::shared_ptr<DataType> type() const = 0;

 protected:
  /// Append to null bitmap
  Status AppendToBitmap(bool is_valid);

  /// Vector append. Treat each zero byte as a null.   If valid_bytes is null
  /// assume all of length bits are valid.
  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);

  /// Uniform append.  Append N times the same validity bit.
  Status AppendToBitmap(int64_t num_bits, bool value);

  /// Set the next length bits to not null (i.e. valid).
  Status SetNotNull(int64_t length);

  // Unsafe operations (don't check capacity/don't resize)

  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }

  // Append to null bitmap, update the length
  void UnsafeAppendToBitmap(bool is_valid) {
    null_bitmap_builder_.UnsafeAppend(is_valid);
    ++length_;
    if (!is_valid) ++null_count_;
  }

  // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
  // assume all of length bits are valid.
  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
    if (valid_bytes == NULLPTR) {
      return UnsafeSetNotNull(length);
    }
    null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
  }

  // Append the same validity value a given number of times.
  void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
    if (value) {
      UnsafeSetNotNull(num_bits);
    } else {
      UnsafeSetNull(num_bits);
    }
  }

  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);

  // Set the next validity bits to not null (i.e. valid).
  void UnsafeSetNotNull(int64_t length);

  // Set the next validity bits to null (i.e. invalid).
  void UnsafeSetNull(int64_t length);

  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);

  /// \brief Finish to an array of the specified ArrayType
  template <typename ArrayType>
  Status FinishTyped(std::shared_ptr<ArrayType>* out) {
    std::shared_ptr<Array> out_untyped;
    ARROW_RETURN_NOT_OK(Finish(&out_untyped));
    *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
    return Status::OK();
  }

  // Check the requested capacity for validity
  Status CheckCapacity(int64_t new_capacity) {
    if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
      return Status::Invalid(
          "Resize capacity must be positive (requested: ", new_capacity, ")");
    }

    if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
      return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                             ", current length: ", length_, ")");
    }

    return Status::OK();
  }

  // Check for array type
  Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
                        const Array& array, const char* message);
  Status CheckArrayType(Type::type expected_type, const Array& array,
                        const char* message);

  MemoryPool* pool_;

  TypedBufferBuilder<bool> null_bitmap_builder_;
  int64_t null_count_ = 0;

  // Array length, so far. Also, the index of the next element to be added
  int64_t length_ = 0;
  int64_t capacity_ = 0;

  // Child value array builders. These are owned by this class
  std::vector<std::shared_ptr<ArrayBuilder>> children_;

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};

NumericArray

/// Concrete Array class for numeric data.
template <typename TYPE>
class NumericArray : public PrimitiveArray {
 public:
  using TypeClass = TYPE;
  using value_type = typename TypeClass::c_type;
  using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;

  explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}

  // Only enable this constructor without a type argument for types without additional
  // metadata
  template <typename T1 = TYPE>
  NumericArray(enable_if_parameter_free<T1, int64_t> length,
               const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
                       null_count, offset) {}

  const value_type* raw_values() const {
    return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
  }

  value_type Value(int64_t i) const { return raw_values()[i]; }

  // For API compatibility with BinaryArray etc.
  value_type GetView(int64_t i) const { return Value(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};

BaseBinaryArray

arrow/cpp/src/arrow/array/array_binary.h

// ----------------------------------------------------------------------
// Binary and String

/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;
  using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;

  /// Return the pointer to the given elements bytes
  // XXX should GetValue(int64_t i) return a string_view?
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    // Account for base offset
    i += data_->offset;
    const offset_type pos = raw_value_offsets_[i];
    *out_length = raw_value_offsets_[i + 1] - pos;
    return raw_data_ + pos;
  }

  /// \brief Get binary value as a string_view
  ///
  /// \param i the value index
  /// \return the view over the selected value
  util::string_view GetView(int64_t i) const {
    // Account for base offset
    i += data_->offset;
    const offset_type pos = raw_value_offsets_[i];
    return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
                             raw_value_offsets_[i + 1] - pos);
  }

  /// \brief Get binary value as a std::string
  ///
  /// \param i the value index
  /// \return the value copied into a std::string
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }

  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  const uint8_t* raw_data() const { return raw_data_; }

  /// \brief Return the data buffer absolute offset of the data for the value
  /// at the passed index.
  ///
  /// Does not perform boundschecking
  offset_type value_offset(int64_t i) const {
    return raw_value_offsets_[i + data_->offset];
  }

  /// \brief Return the length of the data for the value at the passed index.
  ///
  /// Does not perform boundschecking
  offset_type value_length(int64_t i) const {
    i += data_->offset;
    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
  }

  /// \brief Return the total length of the memory in the data buffer
  /// referenced by this array. If the array has been sliced then this may be
  /// less than the size of the data buffer (data_->buffers[2]).
  offset_type total_values_length() const {
    if (data_->length > 0) {
      return raw_value_offsets_[data_->length + data_->offset] -
             raw_value_offsets_[data_->offset];
    } else {
      return 0;
    }
  }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  // For subclasses
  BaseBinaryArray() = default;

  // Protected method for constructors
  void SetData(const std::shared_ptr<ArrayData>& data) {
    this->Array::SetData(data);
    raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
    raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
  }

  const offset_type* raw_value_offsets_ = NULLPTR;
  const uint8_t* raw_data_ = NULLPTR;
};

FlatArray

arrow/cpp/src/arrow/array/array_base.h

/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
 protected:
  using Array::Array;
};

ListArray

/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
 public:
  explicit ListArray(std::shared_ptr<ArrayData> data);

  ListArray(std::shared_ptr<DataType> type, int64_t length,
            std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
            std::shared_ptr<Buffer> null_bitmap = NULLPTR,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct ListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  static Result<std::shared_ptr<ListArray>> FromArrays(
      const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool());

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration of this array's offsets as well as null elements backed
  /// by non-empty lists (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int32Array
  std::shared_ptr<Array> offsets() const;

 protected:
  // This constructor defers SetData to a derived array class
  ListArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};

BaseListArray

/// Base class for variable-sized list arrays, regardless of offset size.
template <typename TYPE>
class BaseListArray : public Array {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  const TypeClass* list_type() const { return list_type_; }

  /// \brief Return array object containing the list's values
  std::shared_ptr<Array> values() const { return values_; }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

  /// Return pointer to raw value offsets accounting for any slice offset
  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  // The following functions will not perform boundschecking
  offset_type value_offset(int64_t i) const {
    return raw_value_offsets_[i + data_->offset];
  }
  offset_type value_length(int64_t i) const {
    i += data_->offset;
    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
  }
  std::shared_ptr<Array> value_slice(int64_t i) const {
    return values_->Slice(value_offset(i), value_length(i));
  }

 protected:
  friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
                                          const std::shared_ptr<ArrayData>& data,
                                          Type::type expected_type_id);

  const TypeClass* list_type_ = NULLPTR;
  std::shared_ptr<Array> values_;
  const offset_type* raw_value_offsets_ = NULLPTR;
};
SetListData
// Private helper for ListArray::SetData.
// Unfortunately, trying to define BaseListArray::SetData outside of this header
// doesn't play well with MSVC.
template <typename TYPE>
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
                 Type::type expected_type_id = TYPE::type_id);

}  // namespace internal
template <typename TYPE>
inline void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
                        Type::type expected_type_id) {
  ARROW_CHECK_EQ(data->buffers.size(), 2);
  ARROW_CHECK_EQ(data->type->id(), expected_type_id);
  ARROW_CHECK_EQ(data->child_data.size(), 1);

  self->Array::SetData(data);

  self->list_type_ = checked_cast<const TYPE*>(data->type.get());
  self->raw_value_offsets_ =
      data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0);

  ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id());
  DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type));
  self->values_ = MakeArray(self->data_->child_data[0]);
}
SetData
void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
  internal::SetListData(this, data);
}
ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); }
FinishInternal

The key implementation of recursion is here:
“std::shared_ptr<ArrayData> items;”

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // Offset padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    if (value_builder_->length() == 0) {
      // Try to make sure we get a non-null values buffer (ARROW-2744)
      ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
    }

    std::shared_ptr<ArrayData> items;
    ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
                           null_count_);
    Reset();
    return Status::OK();
  }

Array

arrow/cpp/src/arrow/array/array_base.h

// User array accessor types

/// \brief Array base type
/// Immutable data array with some logical type and some length.
///
/// Any memory is owned by the respective Buffer instance (or its parents).
///
/// The base class is only required to have a null bitmap buffer if the null
/// count is greater than 0
///
/// If known, the null count can be provided in the base Array constructor. If
/// the null count is not known, pass -1 to indicate that the null count is to
/// be computed on the first call to null_count()
class ARROW_EXPORT Array {
 public:
  virtual ~Array() = default;

  /// \brief Return true if value at index is null. Does not boundscheck
  bool IsNull(int64_t i) const {
    return null_bitmap_data_ != NULLPTR &&
           !BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
  }

  /// \brief Return true if value at index is valid (not null). Does not
  /// boundscheck
  bool IsValid(int64_t i) const {
    return null_bitmap_data_ == NULLPTR ||
           BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
  }

  /// \brief Return a Scalar containing the value of this array at i
  Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;

  /// Size in the number of elements this array contains.
  int64_t length() const { return data_->length; }

  /// A relative position into another array's data, to enable zero-copy
  /// slicing. This value defaults to zero
  int64_t offset() const { return data_->offset; }

  /// The number of null entries in the array. If the null count was not known
  /// at time of construction (and set to a negative value), then the null
  /// count will be computed and cached on the first invocation of this
  /// function
  int64_t null_count() const;

  std::shared_ptr<DataType> type() const { return data_->type; }
  Type::type type_id() const { return data_->type->id(); }

  /// Buffer for the validity (null) bitmap, if any. Note that Union types
  /// never have a null bitmap.
  ///
  /// Note that for `null_count == 0` or for null type, this will be null.
  /// This buffer does not account for any slice offset
  const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }

  /// Raw pointer to the null bitmap.
  ///
  /// Note that for `null_count == 0` or for null type, this will be null.
  /// This buffer does not account for any slice offset
  const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }

  /// Equality comparison with another array
  bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
  bool Equals(const std::shared_ptr<Array>& arr,
              const EqualOptions& = EqualOptions::Defaults()) const;

  /// \brief Return the formatted unified diff of arrow::Diff between this
  /// Array and another Array
  std::string Diff(const Array& other) const;

  /// Approximate equality comparison with another array
  ///
  /// epsilon is only used if this is FloatArray or DoubleArray
  bool ApproxEquals(const std::shared_ptr<Array>& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;
  bool ApproxEquals(const Array& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;

  /// Compare if the range of slots specified are equal for the given array and
  /// this array.  end_idx exclusive.  This methods does not bounds check.
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const Array& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const std::shared_ptr<Array>& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
                   int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
                   int64_t end_idx, int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;

  Status Accept(ArrayVisitor* visitor) const;

  /// Construct a zero-copy view of this array with the given type.
  ///
  /// This method checks if the types are layout-compatible.
  /// Nested types are traversed in depth-first order. Data buffers must have
  /// the same item sizes, even though the logical types may be different.
  /// An error is returned if the types are not layout-compatible.
  Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;

  /// Construct a zero-copy slice of the array with the indicated offset and
  /// length
  ///
  /// \param[in] offset the position of the first element in the constructed
  /// slice
  /// \param[in] length the length of the slice. If there are not enough
  /// elements in the array, the length will be adjusted accordingly
  ///
  /// \return a new object wrapped in std::shared_ptr<Array>
  std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;

  /// Slice from offset until end of the array
  std::shared_ptr<Array> Slice(int64_t offset) const;

  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;

  const std::shared_ptr<ArrayData>& data() const { return data_; }

  int num_fields() const { return static_cast<int>(data_->child_data.size()); }

  /// \return PrettyPrint representation of array suitable for debugging
  std::string ToString() const;

  /// \brief Perform cheap validation checks to determine obvious inconsistencies
  /// within the array's internal data.
  ///
  /// This is O(k) where k is the number of descendents.
  ///
  /// \return Status
  Status Validate() const;

  /// \brief Perform extensive validation checks to determine inconsistencies
  /// within the array's internal data.
  ///
  /// This is potentially O(k*n) where k is the number of descendents and n
  /// is the array length.
  ///
  /// \return Status
  Status ValidateFull() const;

 protected:
  Array() : null_bitmap_data_(NULLPTR) {}

  std::shared_ptr<ArrayData> data_;
  const uint8_t* null_bitmap_data_;

  /// Protected method for constructors
  void SetData(const std::shared_ptr<ArrayData>& data) {
    if (data->buffers.size() > 0) {
      null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
    } else {
      null_bitmap_data_ = NULLPTR;
    }
    data_ = data;
  }

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
};

BufferBuilder

arrow/cpp/src/arrow/buffer_builder.h

// ----------------------------------------------------------------------
// Buffer builder classes

/// \class BufferBuilder
/// \brief A class for incrementally building a contiguous chunk of in-memory
/// data
class ARROW_EXPORT BufferBuilder {
 public:
  explicit BufferBuilder(MemoryPool* pool = default_memory_pool())
      : pool_(pool),
        data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
              &util::internal::non_null_filler),

        capacity_(0),
        size_(0) {}

  /// \brief Constructs new Builder that will start using
  /// the provided buffer until Finish/Reset are called.
  /// The buffer is not resized.
  explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
                         MemoryPool* pool = default_memory_pool())
      : buffer_(std::move(buffer)),
        pool_(pool),
        data_(buffer_->mutable_data()),
        capacity_(buffer_->capacity()),
        size_(buffer_->size()) {}

  /// \brief Resize the buffer to the nearest multiple of 64 bytes
  ///
  /// \param new_capacity the new capacity of the of the builder. Will be
  /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if
  /// new capacity is smaller than the existing size, reallocate internal
  /// buffer. Set to false to avoid reallocations when shrinking the builder.
  /// \return Status
  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
    // Resize(0) is a no-op
    if (new_capacity == 0) {
      return Status::OK();
    }
    if (buffer_ == NULLPTR) {
      ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_));
    } else {
      ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
    }
    capacity_ = buffer_->capacity();
    data_ = buffer_->mutable_data();
    return Status::OK();
  }

  /// \brief Ensure that builder can accommodate the additional number of bytes
  /// without the need to perform allocations
  ///
  /// \param[in] additional_bytes number of additional bytes to make space for
  /// \return Status
  Status Reserve(const int64_t additional_bytes) {
    auto min_capacity = size_ + additional_bytes;
    if (min_capacity <= capacity_) {
      return Status::OK();
    }
    return Resize(GrowByFactor(capacity_, min_capacity), false);
  }

  /// \brief Return a capacity expanded by the desired growth factor
  static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
    // Doubling capacity except for large Reserve requests. 2x growth strategy
    // (versus 1.5x) seems to have slightly better performance when using
    // jemalloc, but significantly better performance when using the system
    // allocator. See ARROW-6450 for further discussion
    return std::max(new_capacity, current_capacity * 2);
  }

  /// \brief Append the given data to the buffer
  ///
  /// The buffer is automatically expanded if necessary.
  Status Append(const void* data, const int64_t length) {
    if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
      ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false));
    }
    UnsafeAppend(data, length);
    return Status::OK();
  }

  /// \brief Append copies of a value to the buffer
  ///
  /// The buffer is automatically expanded if necessary.
  Status Append(const int64_t num_copies, uint8_t value) {
    ARROW_RETURN_NOT_OK(Reserve(num_copies));
    UnsafeAppend(num_copies, value);
    return Status::OK();
  }

  // Advance pointer and zero out memory
  Status Advance(const int64_t length) { return Append(length, 0); }

  // Advance pointer, but don't allocate or zero memory
  void UnsafeAdvance(const int64_t length) { size_ += length; }

  // Unsafe methods don't check existing size
  void UnsafeAppend(const void* data, const int64_t length) {
    memcpy(data_ + size_, data, static_cast<size_t>(length));
    size_ += length;
  }

  void UnsafeAppend(const int64_t num_copies, uint8_t value) {
    memset(data_ + size_, value, static_cast<size_t>(num_copies));
    size_ += num_copies;
  }

  /// \brief Return result of builder as a Buffer object.
  ///
  /// The builder is reset and can be reused afterwards.
  ///
  /// \param[out] out the finalized Buffer object
  /// \param shrink_to_fit if the buffer size is smaller than its capacity,
  /// reallocate to fit more tightly in memory. Set to false to avoid
  /// a reallocation, at the expense of potentially more memory consumption.
  /// \return Status
  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
    ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
    if (size_ != 0) buffer_->ZeroPadding();
    *out = buffer_;
    if (*out == NULLPTR) {
      ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, pool_));
    }
    Reset();
    return Status::OK();
  }

  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
    std::shared_ptr<Buffer> out;
    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
    return out;
  }

  void Reset() {
    buffer_ = NULLPTR;
    capacity_ = size_ = 0;
  }

  /// \brief Set size to a smaller value without modifying builder
  /// contents. For reusable BufferBuilder classes
  /// \param[in] position must be non-negative and less than or equal
  /// to the current length()
  void Rewind(int64_t position) { size_ = position; }

  int64_t capacity() const { return capacity_; }
  int64_t length() const { return size_; }
  const uint8_t* data() const { return data_; }
  uint8_t* mutable_data() { return data_; }

 private:
  std::shared_ptr<ResizableBuffer> buffer_;
  MemoryPool* pool_;
  uint8_t* data_;
  int64_t capacity_;
  int64_t size_;
};
 类似资料: