当前位置: 首页 > 工具软件 > Arrow > 使用案例 >

Arrow 之 Type

乔鸿骞
2023-12-01

BinaryType

/// \brief Concrete type class for variable-size binary data
class ARROW_EXPORT BinaryType : public BaseBinaryType {
 public:
  static constexpr Type::type type_id = Type::BINARY;
  static constexpr bool is_utf8 = false;
  using offset_type = int32_t;
  using PhysicalType = BinaryType;

  static constexpr const char* type_name() { return "binary"; }

  BinaryType() : BinaryType(Type::BINARY) {}

  DataTypeLayout layout() const override {
    return DataTypeLayout({DataTypeLayout::Bitmap(),
                           DataTypeLayout::FixedWidth(sizeof(offset_type)),
                           DataTypeLayout::VariableWidth()});
  }

  std::string ToString() const override;
  std::string name() const override { return "binary"; }

 protected:
  std::string ComputeFingerprint() const override;

  // Allow subclasses like StringType to change the logical type.
  explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
};
/// EXPERIMENTAL: Layout specification for a data type
struct ARROW_EXPORT DataTypeLayout {
  enum BufferKind { FIXED_WIDTH, VARIABLE_WIDTH, BITMAP, ALWAYS_NULL };

  /// Layout specification for a single data type buffer
  struct BufferSpec {
    BufferKind kind;
    int64_t byte_width;  // For FIXED_WIDTH

    bool operator==(const BufferSpec& other) const {
      return kind == other.kind &&
             (kind != FIXED_WIDTH || byte_width == other.byte_width);
    }
    bool operator!=(const BufferSpec& other) const { return !(*this == other); }
  };

  static BufferSpec FixedWidth(int64_t w) { return BufferSpec{FIXED_WIDTH, w}; }
  static BufferSpec VariableWidth() { return BufferSpec{VARIABLE_WIDTH, -1}; }
  static BufferSpec Bitmap() { return BufferSpec{BITMAP, -1}; }
  static BufferSpec AlwaysNull() { return BufferSpec{ALWAYS_NULL, -1}; }

  /// A vector of buffer layout specifications, one for each expected buffer
  std::vector<BufferSpec> buffers;
  /// Whether this type expects an associated dictionary array.
  bool has_dictionary = false;

  explicit DataTypeLayout(std::vector<BufferSpec> v) : buffers(std::move(v)) {}
};

ListType

ToString()

List is recursive. And the ToString() implement the recursion.

std::string ListType::ToString() const {
  std::stringstream s;
  s << "list<" << value_field()->ToString() << ">";
  return s.str();
}

StructType

arrow/cpp/src/arrow/type.h
arrow/cpp/src/arrow/type.cc

ToString()
std::string StructType::ToString() const {
  std::stringstream s;
  s << "struct<";
  for (int i = 0; i < this->num_fields(); ++i) {
    if (i > 0) {
      s << ", ";
    }
    std::shared_ptr<Field> field = this->field(i);
    s << field->ToString();
  }
  s << ">";
  return s.str();
}

MapType

  MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
          bool keys_sorted = false);

  MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
          bool keys_sorted = false);

  MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
          bool keys_sorted = false);

  explicit MapType(std::shared_ptr<Field> value_field, bool keys_sorted = false);
...
  std::shared_ptr<Field> key_field() const { return value_type()->field(0); }
  std::shared_ptr<DataType> key_type() const { return key_field()->type(); }

  std::shared_ptr<Field> item_field() const { return value_type()->field(1); }
  std::shared_ptr<DataType> item_type() const { return item_field()->type(); }

type.cc

MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
                 bool keys_sorted)
    : MapType(::arrow::field("key", std::move(key_type), false),
              ::arrow::field("value", std::move(item_type)), keys_sorted) {}

MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
                 bool keys_sorted)
    : MapType(::arrow::field("key", std::move(key_type), false), std::move(item_field),
              keys_sorted) {}

MapType::MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
                 bool keys_sorted)
    : MapType(
          ::arrow::field("entries",
                         struct_({std::move(key_field), std::move(item_field)}), false),
          keys_sorted) {}

MapType::MapType(std::shared_ptr<Field> value_field, bool keys_sorted)
    : ListType(std::move(value_field)), keys_sorted_(keys_sorted) {
  id_ = type_id;
}

Result<std::shared_ptr<DataType>> MapType::Make(std::shared_ptr<Field> value_field,
                                                bool keys_sorted) {
  const auto& value_type = *value_field->type();
  if (value_field->nullable() || value_type.id() != Type::STRUCT) {
    return Status::TypeError("Map entry field should be non-nullable struct");
  }
  const auto& struct_type = checked_cast<const StructType&>(value_type);
  if (struct_type.num_fields() != 2) {
    return Status::TypeError("Map entry field should have two children (got ",
                             struct_type.num_fields(), ")");
  }
  if (struct_type.field(0)->nullable()) {
    return Status::TypeError("Map key field should be non-nullable");
  }
  return std::make_shared<MapType>(std::move(value_field), keys_sorted);
}

ToString()
std::string MapType::ToString() const {
  std::stringstream s;

  const auto print_field_name = [](std::ostream& os, const Field& field,
                                   const char* std_name) {
    if (field.name() != std_name) {
      os << " ('" << field.name() << "')";
    }
  };
  const auto print_field = [&](std::ostream& os, const Field& field,
                               const char* std_name) {
    os << field.type()->ToString();
    print_field_name(os, field, std_name);
  };

  s << "map<";
  print_field(s, *key_field(), "key");
  s << ", ";
  print_field(s, *item_field(), "value");
  if (keys_sorted_) {
    s << ", keys_sorted";
  }
  print_field_name(s, *value_field(), "entries");
  s << ">";
  return s.str();
}

Types

static constexpr Type::type type_id = Type::MAP;

arrow/cpp/src/arrow/type_fwd.h

struct Type {
  /// \brief Main data type enumeration
  ///
  /// This enumeration provides a quick way to interrogate the category
  /// of a DataType instance.
  enum type {
    /// A NULL type having no physical storage
    NA = 0,

    /// Boolean as 1 bit, LSB bit-packed ordering
    BOOL,

    /// Unsigned 8-bit little-endian integer
    UINT8,

    /// Signed 8-bit little-endian integer
    INT8,

    /// Unsigned 16-bit little-endian integer
    UINT16,

    /// Signed 16-bit little-endian integer
    INT16,

    /// Unsigned 32-bit little-endian integer
    UINT32,

    /// Signed 32-bit little-endian integer
    INT32,

    /// Unsigned 64-bit little-endian integer
    UINT64,

    /// Signed 64-bit little-endian integer
    INT64,

    /// 2-byte floating point value
    HALF_FLOAT,

    /// 4-byte floating point value
    FLOAT,

    /// 8-byte floating point value
    DOUBLE,

    /// UTF8 variable-length string as List<Char>
    STRING,

    /// Variable-length bytes (no guarantee of UTF8-ness)
    BINARY,

    /// Fixed-size binary. Each value occupies the same number of bytes
    FIXED_SIZE_BINARY,

    /// int32_t days since the UNIX epoch
    DATE32,

    /// int64_t milliseconds since the UNIX epoch
    DATE64,

    /// Exact timestamp encoded with int64 since UNIX epoch
    /// Default unit millisecond
    TIMESTAMP,

    /// Time as signed 32-bit integer, representing either seconds or
    /// milliseconds since midnight
    TIME32,

    /// Time as signed 64-bit integer, representing either microseconds or
    /// nanoseconds since midnight
    TIME64,

    /// YEAR_MONTH interval in SQL style
    INTERVAL_MONTHS,

    /// DAY_TIME interval in SQL style
    INTERVAL_DAY_TIME,

    /// Precision- and scale-based decimal type with 128 bits.
    DECIMAL128,

    /// Defined for backward-compatibility.
    DECIMAL = DECIMAL128,

    /// Precision- and scale-based decimal type with 256 bits.
    DECIMAL256,

    /// A list of some logical data type
    LIST,

    /// Struct of logical types
    STRUCT,

    /// Sparse unions of logical types
    SPARSE_UNION,

    /// Dense unions of logical types
    DENSE_UNION,

    /// Dictionary-encoded type, also called "categorical" or "factor"
    /// in other programming languages. Holds the dictionary value
    /// type but not the dictionary itself, which is part of the
    /// ArrayData struct
    DICTIONARY,

    /// Map, a repeated struct logical type
    MAP,

    /// Custom data type, implemented by user
    EXTENSION,

    /// Fixed size list of some logical type
    FIXED_SIZE_LIST,

    /// Measure of elapsed time in either seconds, milliseconds, microseconds
    /// or nanoseconds.
    DURATION,

    /// Like STRING, but with 64-bit offsets
    LARGE_STRING,

    /// Like BINARY, but with 64-bit offsets
    LARGE_BINARY,

    /// Like LIST, but with 64-bit offsets
    LARGE_LIST,

    // Leave this at the end
    MAX_ID
  };
};
Create DataType Instance declaration

arrow/cpp/src/arrow/type_fwd.h

/// \defgroup type-factories Factory functions for creating data types
///
/// Factory functions for creating data types
/// @{

/// \brief Return a NullType instance
std::shared_ptr<DataType> ARROW_EXPORT null();
/// \brief Return a BooleanType instance
std::shared_ptr<DataType> ARROW_EXPORT boolean();
/// \brief Return a Int8Type instance
std::shared_ptr<DataType> ARROW_EXPORT int8();
/// \brief Return a Int16Type instance
std::shared_ptr<DataType> ARROW_EXPORT int16();
/// \brief Return a Int32Type instance
std::shared_ptr<DataType> ARROW_EXPORT int32();
/// \brief Return a Int64Type instance
std::shared_ptr<DataType> ARROW_EXPORT int64();
/// \brief Return a UInt8Type instance
std::shared_ptr<DataType> ARROW_EXPORT uint8();
...
Create DataType Instance implementation

arrow/cpp/src/arrow/type.cc

#define TYPE_FACTORY(NAME, KLASS)                                        \
  std::shared_ptr<DataType> NAME() {                                     \
    static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
    return result;                                                       \
  }

TYPE_FACTORY(null, NullType)
TYPE_FACTORY(boolean, BooleanType)
TYPE_FACTORY(int8, Int8Type)
TYPE_FACTORY(uint8, UInt8Type)
TYPE_FACTORY(int16, Int16Type)
TYPE_FACTORY(uint16, UInt16Type)
TYPE_FACTORY(int32, Int32Type)
TYPE_FACTORY(uint32, UInt32Type)
TYPE_FACTORY(int64, Int64Type)
TYPE_FACTORY(uint64, UInt64Type)
TYPE_FACTORY(float16, HalfFloatType)
TYPE_FACTORY(float32, FloatType)
TYPE_FACTORY(float64, DoubleType)
TYPE_FACTORY(utf8, StringType)
TYPE_FACTORY(large_utf8, LargeStringType)
TYPE_FACTORY(binary, BinaryType)
TYPE_FACTORY(large_binary, LargeBinaryType)
TYPE_FACTORY(date64, Date64Type)
TYPE_FACTORY(date32, Date32Type)

std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
  return std::make_shared<ListType>(value_type);
}
bit_width()

arrow/cpp/src/arrow/type.h

/// \brief Base class for all fixed-width data types
class ARROW_EXPORT FixedWidthType : public DataType {
 public:
  using DataType::DataType;

  virtual int bit_width() const = 0;
};
Get bit_width

arrow/cpp/src/arrow/type_traits.h

static inline int bit_width(Type::type type_id) {
  switch (type_id) {
    case Type::BOOL:
      return 1;
    case Type::UINT8:
    case Type::INT8:
      return 8;
    case Type::UINT16:
    case Type::INT16:
      return 16;
    case Type::UINT32:
    case Type::INT32:
    case Type::DATE32:
    case Type::TIME32:
      return 32;
    case Type::UINT64:
    case Type::INT64:
    case Type::DATE64:
    case Type::TIME64:
    case Type::TIMESTAMP:
    case Type::DURATION:
      return 64;

    case Type::HALF_FLOAT:
      return 16;
    case Type::FLOAT:
      return 32;
    case Type::DOUBLE:
      return 64;

    case Type::INTERVAL_MONTHS:
      return 32;
    case Type::INTERVAL_DAY_TIME:
      return 64;

    case Type::DECIMAL128:
      return 128;
    case Type::DECIMAL256:
      return 256;

    default:
      break;
  }
  return 0;
}
static inline int offset_bit_width(Type::type type_id) {
  switch (type_id) {
    case Type::STRING:
    case Type::BINARY:
    case Type::LIST:
    case Type::MAP:
    case Type::DENSE_UNION:
      return 32;
    case Type::LARGE_STRING:
    case Type::LARGE_BINARY:
    case Type::LARGE_LIST:
      return 64;
    default:
      break;
  }
  return 0;
}

DataType

/// \brief Base class for all data types
///
/// Data types in this library are all *logical*. They can be expressed as
/// either a primitive physical type (bytes or bits of some fixed size), a
/// nested type consisting of other data types, or another data type (e.g. a
/// timestamp encoded as an int64).
///
/// Simple datatypes may be entirely described by their Type::type id, but
/// complex datatypes are usually parametric.
class ARROW_EXPORT DataType : public detail::Fingerprintable {
 public:
  explicit DataType(Type::type id) : detail::Fingerprintable(), id_(id) {}
  ~DataType() override;
  ......
  /// Returns the child-field at index i.
  const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
  ......
  protected:
  // Dummy version that returns a null string (indicating not implemented).
  // Subclasses should override for fast equality checks.
  std::string ComputeFingerprint() const override;

  // Generic versions that works for all regular types, nested or not.
  std::string ComputeMetadataFingerprint() const override;

  Type::type id_;
  std::vector<std::shared_ptr<Field>> children_;


Filed

/// \brief The combination of a field name and data type, with optional metadata
///
/// Fields are used to describe the individual constituents of a
/// nested DataType or a Schema.
///
/// A field's metadata is represented by a KeyValueMetadata instance,
/// which holds arbitrary key-value pairs.
class ARROW_EXPORT Field : public detail::Fingerprintable {
 public:
  Field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
        std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR)
      : detail::Fingerprintable(),
        name_(std::move(name)),
        type_(std::move(type)),
        nullable_(nullable),
        metadata_(std::move(metadata)) {}

  ~Field() override;

 类似资料: