当前位置: 首页 > 工具软件 > Apache ORC > 使用案例 >

ORC 文件层 API 读写

祝英博
2023-12-01

参考:https://codecheese.wordpress.com/2017/06/13/reading-and-writing-orc-files-using-vectorized-row-batch-in-java/

目标:

  • ORC 各种数据类型的写入
  • ORC 查询:带过滤条件(SearchArgument)和列投影

pom 依赖

	<dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.7</version>
    </dependency>
    <dependency>
      <groupId>org.apache.orc</groupId>
      <artifactId>orc-core</artifactId>
      <version>1.5.4</version>
    </dependency>

ORC 写入

package test.test;

import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

/**
 * Writes a sample ORC file named {@code my-file.orc} in the working directory.
 *
 * <p>The file has four columns ({@code long_value}, {@code double_value},
 * {@code boolean_value}, {@code string_value}) and {@link #ROW_COUNT} rows.
 * Rows are buffered in a {@link VectorizedRowBatch} and handed to the writer
 * every time the batch fills up.
 */
public class WriteOrc {

  /** Total number of rows written to the sample file. */
  private static final int ROW_COUNT = 100000;

  /** Rows whose index is below this value get {@code boolean_value = true}. */
  private static final int TRUE_ROW_LIMIT = 50000;

  /**
   * Creates the sample file and fills it with generated rows.
   *
   * @param args ignored
   * @throws java.io.IOException if the file cannot be created or written
   */
  public static void main(String[] args) throws java.io.IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.createStruct()
        .addField("long_value", TypeDescription.createLong())
        .addField("double_value", TypeDescription.createDouble())
        .addField("boolean_value", TypeDescription.createBoolean())
        .addField("string_value", TypeDescription.createString());

    Writer writer = OrcFile.createWriter(new Path("my-file.orc"),
        OrcFile.writerOptions(conf)
            .setSchema(schema));

    // try/finally guarantees the writer is released even when filling or
    // flushing a batch fails part-way through.
    try {
      VectorizedRowBatch batch = schema.createRowBatch();
      LongColumnVector longVector = (LongColumnVector) batch.cols[0];
      DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
      // The boolean column is cast to LongColumnVector and stored as 0 / 1.
      LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
      BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

      for (int r = 0; r < ROW_COUNT; ++r) {
        int row = batch.size++;

        longVector.vector[row] = r;
        doubleVector.vector[row] = r;
        booleanVector.vector[row] = r < TRUE_ROW_LIMIT ? 1 : 0;
        // Encode explicitly as UTF-8 instead of relying on the platform
        // default charset.
        stringVector.setVal(row,
            UUID.randomUUID().toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // Flush as soon as the batch is full, then reuse it.
        if (batch.size == batch.getMaxSize()) {
          writer.addRowBatch(batch);
          batch.reset();
        }
      }

      // Flush the final, partially filled batch.
      if (batch.size != 0) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    } finally {
      writer.close();
    }
  }

}

ORC 投影、过滤

package test.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

/**
 * Reads {@code my-file.orc} from the working directory and prints every row
 * whose {@code long_value} lies in the inclusive range
 * [{@link #MIN_LONG_VALUE}, {@link #MAX_LONG_VALUE}].
 *
 * <p>The range is handed to the ORC reader as a search argument and is also
 * re-checked per row before printing, so only matching rows are emitted.
 */
public class ReadOrc {

  /** Inclusive lower bound of the {@code long_value} filter. */
  private static final long MIN_LONG_VALUE = 0L;

  /** Inclusive upper bound of the {@code long_value} filter. */
  private static final long MAX_LONG_VALUE = 1024L;

  /**
   * Opens the sample file, applies the filter and prints the matching rows
   * to standard output as {@code long, double, boolean, string}.
   *
   * @param args ignored
   * @throws java.io.IOException if the file cannot be opened or read
   */
  public static void main(String[] args) throws java.io.IOException {
    Configuration conf = new Configuration();

    TypeDescription readSchema = TypeDescription.createStruct()
        .addField("long_value", TypeDescription.createLong())
        .addField("double_value", TypeDescription.createDouble())
        .addField("boolean_value", TypeDescription.createBoolean())
        .addField("string_value", TypeDescription.createString());

    Reader reader = OrcFile.createReader(new Path("my-file.orc"),
        OrcFile.readerOptions(conf));

    // NOTE(review): the predicate is wrapped in an explicit startAnd()/end()
    // pair; some hive-storage-api versions appear to require an open operator
    // before a leaf can be added - confirm against the version in use.
    Reader.Options readerOptions = new Reader.Options(conf)
        .searchArgument(
            SearchArgumentFactory
                .newBuilder()
                .startAnd()
                .between("long_value", PredicateLeaf.Type.LONG, MIN_LONG_VALUE, MAX_LONG_VALUE)
                .end()
                .build(),
            new String[]{"long_value"}
        );

    RecordReader rows = reader.rows(readerOptions.schema(readSchema));

    // try/finally guarantees the record reader is released even when reading
    // a batch fails.
    try {
      VectorizedRowBatch batch = readSchema.createRowBatch();

      while (rows.nextBatch(batch)) {
        LongColumnVector longVector = (LongColumnVector) batch.cols[0];
        DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
        LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];

        for (int r = 0; r < batch.size; r++) {
          // When a vector is flagged isRepeating only element 0 is valid.
          int longIdx = longVector.isRepeating ? 0 : r;

          // A null long_value can never satisfy the BETWEEN predicate.
          if (!longVector.noNulls && longVector.isNull[longIdx]) {
            continue;
          }

          long longValue = longVector.vector[longIdx];

          // The search argument only lets the reader skip whole stripes or
          // row groups; rows outside the range can still be returned, so the
          // predicate is re-applied per row.
          if (longValue < MIN_LONG_VALUE || longValue > MAX_LONG_VALUE) {
            continue;
          }

          double doubleValue = doubleVector.vector[doubleVector.isRepeating ? 0 : r];
          boolean boolValue = booleanVector.vector[booleanVector.isRepeating ? 0 : r] != 0;
          String stringValue = stringVector.toString(r);

          System.out.println(longValue + ", " + doubleValue + ", " + boolValue + ", " + stringValue);
        }
      }
    } finally {
      rows.close();
    }
  }

}

 类似资料: