ORC files are stored in a columnar layout. The data model supports List, Map, Struct, and Decimal in addition to the primitive types (PrimitiveTypeInfo).
Add the following dependencies to pom.xml:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>3.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
    <version>3.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>3.1.2</version>
</dependency>
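The org.apache.orc.mapred classes used below come from the orc-mapreduce artifact. hive-exec 3.1.2 bundles ORC, but if your build does not resolve those classes transitively you may need to add it explicitly (the version here is an assumption; match it to the ORC version of your stack):

<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-mapreduce</artifactId>
    <version>1.5.6</version>
</dependency>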
ORC data types are divided into primitive types and group (compound) types. In the Category enum below, true marks a primitive type and false marks a group type (see the sketch after the list):
BOOLEAN("boolean", true),
BYTE("tinyint", true),
SHORT("smallint", true),
INT("int", true),
LONG("bigint", true),
FLOAT("float", true),
DOUBLE("double", true),
STRING("string", true),
DATE("date", true),
TIMESTAMP("timestamp", true),
BINARY("binary", true),
DECIMAL("decimal", true),
VARCHAR("varchar", true),
CHAR("char", true),
LIST("array", false),
MAP("map", false),
STRUCT("struct", false),
UNION("uniontype", false),
TIMESTAMP_INSTANT("timestamp with local time zone", true);
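The lowercase names above are exactly the type names ORC uses in schema strings, so an equivalent schema can also be parsed from a string rather than assembled by hand. A minimal sketch (the class name and fields are made up for illustration):

import org.apache.orc.TypeDescription;

public class SchemaFromString {
    public static void main(String[] args) {
        // The type names in the string match the enum's lowercase names.
        TypeDescription schema = TypeDescription.fromString(
                "struct<name:string,money:decimal(22,4),tags:array<int>>");
        System.out.println(schema); // prints the same schema string back
    }
}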
A helper class that creates a TypeDescription for each ORC category:
package com.study.spark.mr.utils;

import org.apache.orc.TypeDescription;

public class OrcDataSchema {

    public static TypeDescription createBoolean() {
        return new TypeDescription(TypeDescription.Category.BOOLEAN);
    }

    public static TypeDescription createByte() {
        return new TypeDescription(TypeDescription.Category.BYTE);
    }

    public static TypeDescription createShort() {
        return new TypeDescription(TypeDescription.Category.SHORT);
    }

    public static TypeDescription createInt() {
        return new TypeDescription(TypeDescription.Category.INT);
    }

    public static TypeDescription createLong() {
        return new TypeDescription(TypeDescription.Category.LONG);
    }

    public static TypeDescription createFloat() {
        return new TypeDescription(TypeDescription.Category.FLOAT);
    }

    public static TypeDescription createDouble() {
        return new TypeDescription(TypeDescription.Category.DOUBLE);
    }

    public static TypeDescription createString() {
        return new TypeDescription(TypeDescription.Category.STRING);
    }

    public static TypeDescription createDate() {
        return new TypeDescription(TypeDescription.Category.DATE);
    }

    public static TypeDescription createTimestamp() {
        return new TypeDescription(TypeDescription.Category.TIMESTAMP);
    }

    public static TypeDescription createTimestampInstant() {
        return new TypeDescription(TypeDescription.Category.TIMESTAMP_INSTANT);
    }

    public static TypeDescription createBinary() {
        return new TypeDescription(TypeDescription.Category.BINARY);
    }

    /**
     * DECIMAL with the defaults: precision 38, scale 10.
     */
    public static TypeDescription createDecimal() {
        return new TypeDescription(TypeDescription.Category.DECIMAL);
    }

    /**
     * DECIMAL with explicit precision and scale.
     */
    public static TypeDescription createDecimal(int precision, int scale) {
        return new TypeDescription(TypeDescription.Category.DECIMAL)
                .withPrecision(precision).withScale(scale);
    }

    /**
     * VARCHAR with the default max length of 256.
     */
    public static TypeDescription createVarchar() {
        return new TypeDescription(TypeDescription.Category.VARCHAR);
    }

    /**
     * VARCHAR with an explicit max length.
     */
    public static TypeDescription createVarchar(int maxLength) {
        return new TypeDescription(TypeDescription.Category.VARCHAR).withMaxLength(maxLength);
    }

    /**
     * CHAR with the default max length of 256.
     */
    public static TypeDescription createChar() {
        return new TypeDescription(TypeDescription.Category.CHAR);
    }

    /**
     * CHAR with an explicit max length.
     */
    public static TypeDescription createChar(int maxLength) {
        return new TypeDescription(TypeDescription.Category.CHAR).withMaxLength(maxLength);
    }

    public static TypeDescription createList(TypeDescription childType) {
        return TypeDescription.createList(childType);
    }

    public static TypeDescription createMap(TypeDescription keyType, TypeDescription valueType) {
        return TypeDescription.createMap(keyType, valueType);
    }

    public static TypeDescription createUnion() {
        return new TypeDescription(TypeDescription.Category.UNION);
    }

    public static TypeDescription createStruct() {
        return new TypeDescription(TypeDescription.Category.STRUCT);
    }
}
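A quick sanity check (a hypothetical demo class, not part of the original code): the builder calls produce the same schema string that TypeDescription.fromString accepts:

package com.study.spark.mr.utils;

import org.apache.orc.TypeDescription;

public class OrcDataSchemaDemo {
    public static void main(String[] args) {
        // addField returns the struct itself, so calls can be chained.
        TypeDescription schema = OrcDataSchema.createStruct()
                .addField("name", OrcDataSchema.createString())
                .addField("money", OrcDataSchema.createDecimal(22, 4));
        // Prints: struct<name:string,money:decimal(22,4)>
        System.out.println(schema);
    }
}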
Next, a helper class that wraps each Java value in the Writable type ORC expects (example):
package com.study.spark.mr.utils;

import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.*;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.*;

import java.sql.Date;

public class OrcDataWrite {

    public static BooleanWritable booleanWritable(boolean value) {
        return new BooleanWritable(value);
    }

    public static ByteWritable byteWritable(byte value) {
        return new ByteWritable(value);
    }

    public static ShortWritable shortWritable(short value) {
        return new ShortWritable(value);
    }

    public static IntWritable intWritable(int value) {
        return new IntWritable(value);
    }

    public static LongWritable longWritable(long value) {
        return new LongWritable(value);
    }

    public static FloatWritable floatWritable(float value) {
        return new FloatWritable(value);
    }

    public static DoubleWritable doubleWritable(double value) {
        return new DoubleWritable(value);
    }

    public static BytesWritable bytesWritable(byte[] value) {
        return new BytesWritable(value);
    }

    // STRING, VARCHAR and CHAR values are all carried as Text.
    public static Text stringWritable(String value) {
        return new Text(value);
    }

    public static Text varcharWritable(String value) {
        return new Text(value);
    }

    public static Text charWritable(String value) {
        return new Text(value);
    }

    public static DateWritable dateWritable(Date value) {
        return new DateWritable(value);
    }

    public static OrcTimestamp timestampWritable(long value) {
        return new OrcTimestamp(value);
    }

    public static HiveDecimalWritable decimalWritable(String value) {
        return new HiveDecimalWritable(value);
    }

    public static OrcList<WritableComparable> listWritable(TypeDescription type) {
        return new OrcList<>(type);
    }

    public static OrcMap<WritableComparable, WritableComparable> mapWritable(TypeDescription type) {
        return new OrcMap<>(type);
    }

    public static OrcUnion unionWritable(TypeDescription type) {
        return new OrcUnion(type);
    }
}
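OrcUnion is the only wrapper not exercised by the test below: a union value carries a branch tag plus the value for that branch. A minimal sketch (a hypothetical demo class; the tag byte selects the child type by position):

package com.study.spark.mr.utils;

import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcUnion;

public class OrcUnionDemo {
    public static void main(String[] args) {
        TypeDescription unionType = TypeDescription.fromString("uniontype<int,string>");
        OrcUnion union = OrcDataWrite.unionWritable(unionType);
        // Tag 0 selects the int branch, tag 1 would select string.
        union.set((byte) 0, OrcDataWrite.intWritable(42));
    }
}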
Finally, a test class that builds the schema, walks it recursively, and populates an OrcStruct:
package com.study.spark.mr.utils;

import org.apache.hadoop.io.WritableComparable;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.*;

import java.sql.Date;
import java.util.List;

public class OrcDataTest {

    public static void main(String[] args) {
        TypeDescription typeDescription = getTypeDescription();
        System.out.println(typeDescription.toString());
        display(typeDescription, "");
        write(typeDescription);
    }

    /**
     * Build the ORC schema used by this example.
     */
    public static TypeDescription getTypeDescription() {
        TypeDescription description = OrcDataSchema.createStruct();
        description.addField("name", OrcDataSchema.createString());
        description.addField("money", OrcDataSchema.createDecimal(22, 4));
        description.addField("list", OrcDataSchema.createList(OrcDataSchema.createInt()));
        description.addField("map", OrcDataSchema.createMap(OrcDataSchema.createString(), OrcDataSchema.createInt()));
        return description;
    }

    /**
     * Walk the schema recursively and print each type and field name.
     */
    public static void display(TypeDescription description, String field) {
        System.out.println("type = " + description.getCategory().getName() + " field = " + field);
        if (!description.getCategory().isPrimitive()) {
            // Note: Category.getName() returns the lowercase SQL name ("struct"),
            // so compare the Category itself rather than its name.
            if (description.getCategory() == TypeDescription.Category.STRUCT) {
                List<TypeDescription> descriptions = description.getChildren();
                List<String> fields = description.getFieldNames();
                for (int i = 0; i < descriptions.size(); i++) {
                    display(descriptions.get(i), fields.get(i));
                }
            } else {
                List<TypeDescription> descriptions = description.getChildren();
                for (int i = 0; i < descriptions.size(); i++) {
                    display(descriptions.get(i), "");
                }
            }
        }
    }

    /**
     * Populate an OrcStruct with a sample value for each field type.
     */
    public static OrcStruct write(TypeDescription description) {
        OrcStruct orcStruct = new OrcStruct(description);
        List<TypeDescription> children = description.getChildren();
        for (int i = 0; i < children.size(); i++) {
            WritableComparable writableComparable = null;
            // Sample values; in real code they would come from your data.
            // A field can be set by index i or by field name.
            switch (children.get(i).getCategory()) {
                case BOOLEAN:
                    writableComparable = OrcDataWrite.booleanWritable(false);
                    break;
                case BYTE:
                    writableComparable = OrcDataWrite.byteWritable((byte) 1);
                    break;
                case SHORT:
                    writableComparable = OrcDataWrite.shortWritable((short) 2);
                    break;
                case INT:
                    writableComparable = OrcDataWrite.intWritable(2);
                    break;
                case LONG:
                    writableComparable = OrcDataWrite.longWritable(2L);
                    break;
                case FLOAT:
                    writableComparable = OrcDataWrite.floatWritable(2.0F);
                    break;
                case DOUBLE:
                    writableComparable = OrcDataWrite.doubleWritable(2.0D);
                    break;
                case BINARY:
                    writableComparable = OrcDataWrite.bytesWritable(new byte[0]);
                    break;
                case CHAR:
                case VARCHAR:
                case STRING:
                    writableComparable = OrcDataWrite.stringWritable("value");
                    break;
                case DATE:
                    writableComparable = OrcDataWrite.dateWritable(new Date(System.currentTimeMillis()));
                    break;
                case TIMESTAMP:
                case TIMESTAMP_INSTANT:
                    writableComparable = OrcDataWrite.timestampWritable(System.currentTimeMillis());
                    break;
                case DECIMAL:
                    writableComparable = OrcDataWrite.decimalWritable("20.01");
                    break;
                case STRUCT:
                    writableComparable = write(children.get(i));
                    break;
                case LIST:
                    OrcList<WritableComparable> list = OrcDataWrite.listWritable(children.get(i));
                    TypeDescription sub = children.get(i).getChildren().get(0);
                    // Handle each element type as needed; only INT is demonstrated here.
                    if (sub.getCategory() == TypeDescription.Category.INT) {
                        list.add(OrcDataWrite.intWritable(1));
                        list.add(OrcDataWrite.intWritable(1));
                        list.add(OrcDataWrite.intWritable(1));
                    }
                    writableComparable = list;
                    break;
                case MAP:
                    OrcMap<WritableComparable, WritableComparable> map = OrcDataWrite.mapWritable(children.get(i));
                    TypeDescription key = children.get(i).getChildren().get(0);
                    TypeDescription value = children.get(i).getChildren().get(1);
                    // In real code, branch on the actual key/value categories;
                    // this test only fills the string -> int case.
                    if (key.getCategory() == TypeDescription.Category.STRING
                            && value.getCategory() == TypeDescription.Category.INT) {
                        map.put(OrcDataWrite.stringWritable("key"), OrcDataWrite.intWritable(1));
                    }
                    writableComparable = map;
                    break;
                default:
                    throw new IllegalArgumentException("Unknown type " + children.get(i));
            }
            orcStruct.setFieldValue(i, writableComparable);
        }
        return orcStruct;
    }
}
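Note that write only builds an OrcStruct in memory. To persist rows to an actual file, one option (not the only one) is to wrap the core ORC writer in OrcMapredRecordWriter, which accepts OrcStruct values directly. A minimal sketch, assuming the orc-mapreduce artifact is on the classpath; the demo class name and output path are made up:

package com.study.spark.mr.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.mapred.OrcMapredRecordWriter;
import org.apache.orc.mapred.OrcStruct;

public class OrcStructFileDemo {
    public static void main(String[] args) throws Exception {
        TypeDescription schema = OrcDataTest.getTypeDescription();
        // Core ORC writer; OrcMapredRecordWriter converts OrcStruct rows
        // into the vectorized batches the writer expects.
        Writer writer = OrcFile.createWriter(new Path("/tmp/orc-demo/test.orc"),
                OrcFile.writerOptions(new Configuration()).setSchema(schema));
        OrcMapredRecordWriter<OrcStruct> recordWriter = new OrcMapredRecordWriter<>(writer);
        recordWriter.write(NullWritable.get(), OrcDataTest.write(schema));
        recordWriter.close(Reporter.NULL);
    }
}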