/// \brief Concrete type class for variable-size binary data
///
/// Offsets into the data are stored as offset_type (int32_t) values.
class ARROW_EXPORT BinaryType : public BaseBinaryType {
 public:
  static constexpr Type::type type_id = Type::BINARY;
  static constexpr bool is_utf8 = false;
  using offset_type = int32_t;
  using PhysicalType = BinaryType;

  /// Compile-time name of this type.
  static constexpr const char* type_name() { return "binary"; }

  /// Construct a binary type tagged with Type::BINARY.
  BinaryType() : BinaryType(type_id) {}

  /// Describe the three buffers of this type, in order: a bitmap, a
  /// fixed-width buffer of offset_type entries and a variable-width buffer.
  DataTypeLayout layout() const override {
    const auto bitmap_buffer = DataTypeLayout::Bitmap();
    const auto offsets_buffer = DataTypeLayout::FixedWidth(sizeof(offset_type));
    const auto variable_buffer = DataTypeLayout::VariableWidth();
    return DataTypeLayout({bitmap_buffer, offsets_buffer, variable_buffer});
  }

  std::string ToString() const override;

  /// Runtime name of this type; same text as type_name().
  std::string name() const override { return type_name(); }

 protected:
  std::string ComputeFingerprint() const override;

  // Lets subclasses such as StringType reuse this class under a different
  // logical type id.
  explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
};
/// EXPERIMENTAL: Layout specification for a data type
struct ARROW_EXPORT DataTypeLayout {
  /// The kinds of buffer a layout can list.
  enum BufferKind { FIXED_WIDTH, VARIABLE_WIDTH, BITMAP, ALWAYS_NULL };

  /// Layout specification for a single data type buffer
  struct BufferSpec {
    BufferKind kind;
    int64_t byte_width;  // Only compared when kind == FIXED_WIDTH

    /// Specs are equal when their kinds match; byte_width takes part in the
    /// comparison for FIXED_WIDTH buffers only.
    bool operator==(const BufferSpec& other) const {
      if (kind != other.kind) {
        return false;
      }
      if (kind == FIXED_WIDTH) {
        return byte_width == other.byte_width;
      }
      return true;
    }

    bool operator!=(const BufferSpec& other) const { return !operator==(other); }
  };

  /// Spec for a fixed-width buffer whose entries are `w` bytes wide.
  static BufferSpec FixedWidth(int64_t w) { return {FIXED_WIDTH, w}; }
  /// Spec for a variable-width buffer (byte_width is set to -1).
  static BufferSpec VariableWidth() { return {VARIABLE_WIDTH, -1}; }
  /// Spec for a bitmap buffer (byte_width is set to -1).
  static BufferSpec Bitmap() { return {BITMAP, -1}; }
  /// Spec for an always-null buffer (byte_width is set to -1).
  static BufferSpec AlwaysNull() { return {ALWAYS_NULL, -1}; }

  /// A vector of buffer layout specifications, one for each expected buffer
  std::vector<BufferSpec> buffers;

  /// Whether this type expects an associated dictionary array.
  bool has_dictionary = false;

  /// Build a layout from its buffer specs; has_dictionary stays false.
  explicit DataTypeLayout(std::vector<BufferSpec> v) : buffers(std::move(v)) {}
};
A list type is recursive, and its ToString() implements the recursion by calling ToString() on the value field:
/// Render this type as "list<...>", where the inner text is whatever the value
/// field's own ToString() produces.
std::string ListType::ToString() const {
  std::string repr = "list<";
  repr += value_field()->ToString();
  repr += ">";
  return repr;
}
arrow/cpp/src/arrow/type.h
arrow/cpp/src/arrow/type.cc
/// Render this type as "struct<f0, f1, ...>", where each entry is the
/// corresponding child field's ToString() and entries are joined with ", ".
std::string StructType::ToString() const {
  std::stringstream out;
  out << "struct<";
  for (int idx = 0; idx < this->num_fields(); ++idx) {
    // Separator goes before every entry except the first.
    const char* separator = (idx == 0) ? "" : ", ";
    out << separator << this->field(idx)->ToString();
  }
  out << ">";
  return out.str();
}
/// \brief Construct from key and item data types.
///
/// The definition wraps key_type in a non-nullable field named "key" and
/// item_type in a field named "value".
MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
bool keys_sorted = false);
/// \brief Construct from a key data type and an existing item field.
///
/// The definition wraps key_type in a non-nullable field named "key";
/// item_field is used as given.
MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
bool keys_sorted = false);
/// \brief Construct from existing key and item fields.
///
/// The definition packs both fields into a non-nullable struct field named
/// "entries".
MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
bool keys_sorted = false);
/// \brief Construct directly from the entries field.
///
/// The definition forwards value_field to the ListType constructor without
/// checking its shape; MapType::Make() performs those checks.
explicit MapType(std::shared_ptr<Field> value_field, bool keys_sorted = false);
...
/// Field describing the keys: child 0 of value_type().
std::shared_ptr<Field> key_field() const {
  return value_type()->field(0);
}
/// Data type of the keys, i.e. the type of child 0 of value_type().
std::shared_ptr<DataType> key_type() const {
  return value_type()->field(0)->type();
}
/// Field describing the items: child 1 of value_type().
std::shared_ptr<Field> item_field() const {
  return value_type()->field(1);
}
/// Data type of the items, i.e. the type of child 1 of value_type().
std::shared_ptr<DataType> item_type() const {
  return value_type()->field(1)->type();
}
arrow/cpp/src/arrow/type.cc
// Constructor chain: each overload below delegates to the next, more explicit
// one until the final overload hands a single entries field to ListType.

// Wrap the key type in a non-nullable "key" field and the item type in a
// "value" field, then delegate to the (key_field, item_field) overload.
MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
bool keys_sorted)
: MapType(::arrow::field("key", std::move(key_type), false),
::arrow::field("value", std::move(item_type)), keys_sorted) {}
// Wrap only the key type (non-nullable "key" field); the caller's item field
// is forwarded unchanged.
MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
bool keys_sorted)
: MapType(::arrow::field("key", std::move(key_type), false), std::move(item_field),
keys_sorted) {}
// Pack the key and item fields into a non-nullable struct field named
// "entries" and delegate to the value_field overload.
MapType::MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
bool keys_sorted)
: MapType(
::arrow::field("entries",
struct_({std::move(key_field), std::move(item_field)}), false),
keys_sorted) {}
// Final overload: construct the ListType base from the entries field, record
// keys_sorted, and then overwrite id_ with MapType's own type_id.
MapType::MapType(std::shared_ptr<Field> value_field, bool keys_sorted)
: ListType(std::move(value_field)), keys_sorted_(keys_sorted) {
id_ = type_id;
}
/// \brief Validate an entries field and build a MapType from it.
///
/// Returns a TypeError status when value_field is nullable or not a struct,
/// when the struct does not have exactly two children, or when its first child
/// (the key) is nullable. Otherwise returns a new MapType.
Result<std::shared_ptr<DataType>> MapType::Make(std::shared_ptr<Field> value_field,
                                                bool keys_sorted) {
  const DataType& entries_type = *value_field->type();
  const bool is_non_nullable_struct =
      !value_field->nullable() && entries_type.id() == Type::STRUCT;
  if (!is_non_nullable_struct) {
    return Status::TypeError("Map entry field should be non-nullable struct");
  }

  // The id check above established that the type is a StructType.
  const auto& entries_struct = checked_cast<const StructType&>(entries_type);
  const int num_children = entries_struct.num_fields();
  if (num_children != 2) {
    return Status::TypeError("Map entry field should have two children (got ",
                             num_children, ")");
  }

  const bool key_is_nullable = entries_struct.field(0)->nullable();
  if (key_is_nullable) {
    return Status::TypeError("Map key field should be non-nullable");
  }

  return std::make_shared<MapType>(std::move(value_field), keys_sorted);
}
/// Render this type as "map<KEY, ITEM[, keys_sorted]>".
///
/// A field whose name differs from the standard one ("key", "value",
/// "entries") has " ('<name>')" appended: after the key type, after the item
/// type, and just before the closing '>' for the entries field.
std::string MapType::ToString() const {
  // Empty when the field carries its standard name, else " ('<name>')".
  const auto name_suffix = [](const Field& field,
                              const char* std_name) -> std::string {
    if (field.name() == std_name) {
      return std::string();
    }
    return " ('" + field.name() + "')";
  };
  // Type text followed by the optional non-standard-name suffix.
  const auto describe = [&name_suffix](const Field& field,
                                       const char* std_name) -> std::string {
    return field.type()->ToString() + name_suffix(field, std_name);
  };

  std::stringstream out;
  out << "map<" << describe(*key_field(), "key") << ", "
      << describe(*item_field(), "value");
  if (keys_sorted_) {
    out << ", keys_sorted";
  }
  out << name_suffix(*value_field(), "entries") << ">";
  return out.str();
}
/// Type id for map types; the MapType constructor assigns it to id_.
static constexpr Type::type type_id = Type::MAP;
arrow/cpp/src/arrow/type_fwd.h
/// Wrapper struct that scopes the data type id enumeration as Type::type.
struct Type {
  /// \brief Main data type enumeration
  ///
  /// This enumeration provides a quick way to interrogate the category
  /// of a DataType instance.
  ///
  /// Every id is written out explicitly so that the numeric value of an
  /// entry is visible at a glance.
  enum type {
    /// A NULL type having no physical storage
    NA = 0,

    /// Boolean as 1 bit, LSB bit-packed ordering
    BOOL = 1,

    /// Unsigned 8-bit little-endian integer
    UINT8 = 2,

    /// Signed 8-bit little-endian integer
    INT8 = 3,

    /// Unsigned 16-bit little-endian integer
    UINT16 = 4,

    /// Signed 16-bit little-endian integer
    INT16 = 5,

    /// Unsigned 32-bit little-endian integer
    UINT32 = 6,

    /// Signed 32-bit little-endian integer
    INT32 = 7,

    /// Unsigned 64-bit little-endian integer
    UINT64 = 8,

    /// Signed 64-bit little-endian integer
    INT64 = 9,

    /// 2-byte floating point value
    HALF_FLOAT = 10,

    /// 4-byte floating point value
    FLOAT = 11,

    /// 8-byte floating point value
    DOUBLE = 12,

    /// UTF8 variable-length string as List<Char>
    STRING = 13,

    /// Variable-length bytes (no guarantee of UTF8-ness)
    BINARY = 14,

    /// Fixed-size binary. Each value occupies the same number of bytes
    FIXED_SIZE_BINARY = 15,

    /// int32_t days since the UNIX epoch
    DATE32 = 16,

    /// int64_t milliseconds since the UNIX epoch
    DATE64 = 17,

    /// Exact timestamp encoded with int64 since UNIX epoch
    /// Default unit millisecond
    TIMESTAMP = 18,

    /// Time as signed 32-bit integer, representing either seconds or
    /// milliseconds since midnight
    TIME32 = 19,

    /// Time as signed 64-bit integer, representing either microseconds or
    /// nanoseconds since midnight
    TIME64 = 20,

    /// YEAR_MONTH interval in SQL style
    INTERVAL_MONTHS = 21,

    /// DAY_TIME interval in SQL style
    INTERVAL_DAY_TIME = 22,

    /// Precision- and scale-based decimal type with 128 bits.
    DECIMAL128 = 23,

    /// Defined for backward-compatibility.
    DECIMAL = DECIMAL128,

    /// Precision- and scale-based decimal type with 256 bits.
    DECIMAL256 = 24,

    /// A list of some logical data type
    LIST = 25,

    /// Struct of logical types
    STRUCT = 26,

    /// Sparse unions of logical types
    SPARSE_UNION = 27,

    /// Dense unions of logical types
    DENSE_UNION = 28,

    /// Dictionary-encoded type, also called "categorical" or "factor"
    /// in other programming languages. Holds the dictionary value
    /// type but not the dictionary itself, which is part of the
    /// ArrayData struct
    DICTIONARY = 29,

    /// Map, a repeated struct logical type
    MAP = 30,

    /// Custom data type, implemented by user
    EXTENSION = 31,

    /// Fixed size list of some logical type
    FIXED_SIZE_LIST = 32,

    /// Measure of elapsed time in either seconds, milliseconds, microseconds
    /// or nanoseconds.
    DURATION = 33,

    /// Like STRING, but with 64-bit offsets
    LARGE_STRING = 34,

    /// Like BINARY, but with 64-bit offsets
    LARGE_BINARY = 35,

    /// Like LIST, but with 64-bit offsets
    LARGE_LIST = 36,

    // Leave this at the end. Deliberately has no explicit value so that it
    // always equals the last id above plus one.
    MAX_ID
  };
};
arrow/cpp/src/arrow/type_fwd.h
/// \defgroup type-factories Factory functions for creating data types
///
/// Factory functions for creating data types
/// @{
/// \brief Return a NullType instance
std::shared_ptr<DataType> ARROW_EXPORT null();
/// \brief Return a BooleanType instance
std::shared_ptr<DataType> ARROW_EXPORT boolean();
/// \brief Return an Int8Type instance
std::shared_ptr<DataType> ARROW_EXPORT int8();
/// \brief Return an Int16Type instance
std::shared_ptr<DataType> ARROW_EXPORT int16();
/// \brief Return an Int32Type instance
std::shared_ptr<DataType> ARROW_EXPORT int32();
/// \brief Return an Int64Type instance
std::shared_ptr<DataType> ARROW_EXPORT int64();
/// \brief Return a UInt8Type instance
std::shared_ptr<DataType> ARROW_EXPORT uint8();
...
arrow/cpp/src/arrow/type.cc
// Defines a factory function NAME() for the parameter-free type KLASS.
//
// The instance is held in a function-local static: it is default-constructed
// once and every call returns a copy of the same shared_ptr.
#define TYPE_FACTORY(NAME, KLASS) \
std::shared_ptr<DataType> NAME() { \
static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
return result; \
}
// One factory per type that needs no constructor arguments.
TYPE_FACTORY(null, NullType)
TYPE_FACTORY(boolean, BooleanType)
TYPE_FACTORY(int8, Int8Type)
TYPE_FACTORY(uint8, UInt8Type)
TYPE_FACTORY(int16, Int16Type)
TYPE_FACTORY(uint16, UInt16Type)
TYPE_FACTORY(int32, Int32Type)
TYPE_FACTORY(uint32, UInt32Type)
TYPE_FACTORY(int64, Int64Type)
TYPE_FACTORY(uint64, UInt64Type)
TYPE_FACTORY(float16, HalfFloatType)
TYPE_FACTORY(float32, FloatType)
TYPE_FACTORY(float64, DoubleType)
TYPE_FACTORY(utf8, StringType)
TYPE_FACTORY(large_utf8, LargeStringType)
TYPE_FACTORY(binary, BinaryType)
TYPE_FACTORY(large_binary, LargeBinaryType)
TYPE_FACTORY(date64, Date64Type)
TYPE_FACTORY(date32, Date32Type)
std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
return std::make_shared<ListType>(value_type);
}
arrow/cpp/src/arrow/type.h
/// \brief Base class for all fixed-width data types
///
/// Abstract: subclasses must implement bit_width().
class ARROW_EXPORT FixedWidthType : public DataType {
public:
// Inherit DataType's constructors unchanged.
using DataType::DataType;
/// \brief Width of one value of this type, in bits
virtual int bit_width() const = 0;
};
arrow/cpp/src/arrow/type_traits.h
/// \brief Bit width of a value of the given type id
///
/// Returns 0 for every type id that is not listed below.
static inline int bit_width(Type::type type_id) {
  switch (type_id) {
    case Type::BOOL:
      return 1;

    case Type::UINT8:
    case Type::INT8:
      return 8;

    case Type::UINT16:
    case Type::INT16:
    case Type::HALF_FLOAT:
      return 16;

    case Type::UINT32:
    case Type::INT32:
    case Type::DATE32:
    case Type::TIME32:
    case Type::FLOAT:
    case Type::INTERVAL_MONTHS:
      return 32;

    case Type::UINT64:
    case Type::INT64:
    case Type::DATE64:
    case Type::TIME64:
    case Type::TIMESTAMP:
    case Type::DURATION:
    case Type::DOUBLE:
    case Type::INTERVAL_DAY_TIME:
      return 64;

    case Type::DECIMAL128:
      return 128;

    case Type::DECIMAL256:
      return 256;

    default:
      return 0;
  }
}
/// \brief Bit width of the offsets used by the given type id
///
/// Returns 32 or 64 for the type ids listed below and 0 for all others.
static inline int offset_bit_width(Type::type type_id) {
  int width = 0;
  switch (type_id) {
    case Type::STRING:
    case Type::BINARY:
    case Type::LIST:
    case Type::MAP:
    case Type::DENSE_UNION:
      width = 32;
      break;

    case Type::LARGE_STRING:
    case Type::LARGE_BINARY:
    case Type::LARGE_LIST:
      width = 64;
      break;

    default:
      break;
  }
  return width;
}
/// \brief Base class for all data types
///
/// Data types in this library are all *logical*. They can be expressed as
/// either a primitive physical type (bytes or bits of some fixed size), a
/// nested type consisting of other data types, or another data type (e.g. a
/// timestamp encoded as an int64).
///
/// Simple datatypes may be entirely described by their Type::type id, but
/// complex datatypes are usually parametric.
class ARROW_EXPORT DataType : public detail::Fingerprintable {
public:
explicit DataType(Type::type id) : detail::Fingerprintable(), id_(id) {}
~DataType() override;
......
/// Returns the child-field at index i.
const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
......
protected:
// Dummy version that returns a null string (indicating not implemented).
// Subclasses should override for fast equality checks.
std::string ComputeFingerprint() const override;
// Generic versions that works for all regular types, nested or not.
std::string ComputeMetadataFingerprint() const override;
Type::type id_;
std::vector<std::shared_ptr<Field>> children_;
/// \brief The combination of a field name and data type, with optional metadata
///
/// Fields are used to describe the individual constituents of a
/// nested DataType or a Schema.
///
/// A field's metadata is represented by a KeyValueMetadata instance,
/// which holds arbitrary key-value pairs.
class ARROW_EXPORT Field : public detail::Fingerprintable {
public:
Field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR)
: detail::Fingerprintable(),
name_(std::move(name)),
type_(std::move(type)),
nullable_(nullable),
metadata_(std::move(metadata)) {}
~Field() override;