Reference: https://codecheese.wordpress.com/2017/06/13/reading-and-writing-orc-files-using-vectorized-row-batch-in-java/
Goal: write an ORC file from Java using VectorizedRowBatch, then read it back with a search argument. Two Maven dependencies are required:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.7</version>
</dependency>
<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-core</artifactId>
    <version>1.5.4</version>
</dependency>
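The vector classes (VectorizedRowBatch, LongColumnVector, and so on) come in transitively through orc-core's hive-storage-api dependency, so nothing beyond these two artifacts should be needed. The writer below builds a four-column schema, fills a VectorizedRowBatch row by row, and flushes each batch once it is full: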
package test.test;

import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class WriteOrc {
    public static void main(String[] args) throws java.io.IOException {
        Configuration conf = new Configuration();
        // File schema: a struct with four typed fields.
        TypeDescription schema = TypeDescription.createStruct()
                .addField("long_value", TypeDescription.createLong())
                .addField("double_value", TypeDescription.createDouble())
                .addField("boolean_value", TypeDescription.createBoolean())
                .addField("string_value", TypeDescription.createString());
        // createWriter fails if my-file.orc already exists; delete it before re-running.
        Writer writer = OrcFile.createWriter(new Path("my-file.orc"),
                OrcFile.writerOptions(conf).setSchema(schema));
        // A batch holds up to 1024 rows by default; each column is a typed vector.
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector longVector = (LongColumnVector) batch.cols[0];
        DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
        // Booleans are stored in a LongColumnVector as 0/1.
        LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];
        for (int r = 0; r < 100000; ++r) {
            int row = batch.size++;
            longVector.vector[row] = r;
            doubleVector.vector[row] = r;
            booleanVector.vector[row] = r < 50000 ? 1 : 0;
            stringVector.setVal(row, UUID.randomUUID().toString().getBytes());
            // Flush the batch to the writer whenever it fills up.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        // Flush any trailing rows that did not fill a complete batch.
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();
    }
}
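None of the rows above are null. To write a SQL NULL, the ColumnVector null flags have to be set explicitly; a minimal sketch for the long_value column (noNulls and isNull are the standard ColumnVector fields, nothing specific to this example):

    // Inside the row loop: mark this row of long_value as NULL.
    longVector.noNulls = false;     // the column now contains at least one null
    longVector.isNull[row] = true;  // per-row null flag
    longVector.vector[row] = 0;     // the value itself is ignored for null rows

The reader below then opens the file and pushes a range predicate on long_value down via a search argument: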
package test.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class ReadOrc {
    public static void main(String[] args) throws java.io.IOException {
        Configuration conf = new Configuration();
        // Read schema: field names and types must match what was written.
        TypeDescription readSchema = TypeDescription.createStruct()
                .addField("long_value", TypeDescription.createLong())
                .addField("double_value", TypeDescription.createDouble())
                .addField("boolean_value", TypeDescription.createBoolean())
                .addField("string_value", TypeDescription.createString());
        Reader reader = OrcFile.createReader(new Path("my-file.orc"),
                OrcFile.readerOptions(conf));
        // The search argument prunes at row-group granularity (10,000 rows by
        // default), so rows outside [0, 1024] that share a surviving row group
        // may still be returned; filter again in application code if needed.
        Reader.Options readerOptions = new Reader.Options(conf)
                .searchArgument(
                        SearchArgumentFactory.newBuilder()
                                .between("long_value", PredicateLeaf.Type.LONG, 0L, 1024L)
                                .build(),
                        new String[]{"long_value"});
        RecordReader rows = reader.rows(readerOptions.schema(readSchema));
        VectorizedRowBatch batch = readSchema.createRowBatch();
        // nextBatch refills the same batch object each call, so the column
        // casts only need to be done once, outside the loop.
        LongColumnVector longVector = (LongColumnVector) batch.cols[0];
        DoubleColumnVector doubleVector = (DoubleColumnVector) batch.cols[1];
        LongColumnVector booleanVector = (LongColumnVector) batch.cols[2];
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[3];
        while (rows.nextBatch(batch)) {
            for (int r = 0; r < batch.size; r++) {
                long longValue = longVector.vector[r];
                double doubleValue = doubleVector.vector[r];
                boolean boolValue = booleanVector.vector[r] != 0;
                String stringValue = stringVector.toString(r);
                System.out.println(longValue + ", " + doubleValue + ", "
                        + boolValue + ", " + stringValue);
            }
        }
        rows.close();
    }
}
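A caveat on the read side: a ColumnVector can come back with isRepeating set, meaning every row of the batch shares entry 0 of the vector. The direct vector[r] lookups above assume it is false; a defensive variant of the long_value read would be:

    // If the batch repeats a single value, only index 0 is populated.
    int idx = longVector.isRepeating ? 0 : r;
    long longValue = longVector.vector[idx];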