JAVA读取ORC文件还原数据
笔者在用 Java 读取 ORC 文件时踩过不少坑,希望本文能帮助同行少走弯路。废话不多说,直接上代码。
一,代码示例
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.*;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
 * Utility for loading the full contents of an ORC file into memory as a list of rows,
 * where every row maps a top-level column name to the string form of its value.
 */
public class ReadOrcUtils {

    /** Maximum number of rows requested from the ORC reader per vectorized batch. */
    private static final int BATCH_SIZE = 1024 * 1024;

    /**
     * Reads every row of the ORC file at {@code path}.
     *
     * @param configuration Hadoop configuration used to open the file
     * @param path          location of the ORC file
     * @return one map per row, keyed by top-level column name; a null cell (or a column
     *         vector type this class does not recognise) is stored as a {@code null} value
     * @throws IOException if the file cannot be opened or read
     */
    public static List<Map<String, String>> read(Configuration configuration, String path) throws IOException {
        List<Map<String, String>> rows = new LinkedList<>();
        try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(configuration))) {
            TypeDescription schema = reader.getSchema();
            List<String> fieldNames = schema.getFieldNames();
            try (RecordReader records = reader.rows(reader.options())) {
                VectorizedRowBatch batch = schema.createRowBatch(BATCH_SIZE);
                while (records.nextBatch(batch)) {
                    for (int rowNum = 0; rowNum < batch.size; rowNum++) {
                        Map<String, String> row = new HashMap<>();
                        for (int col = 0; col < batch.cols.length; col++) {
                            // The value must be looked up by ROW index inside the column vector;
                            // the column index only selects which vector to read from.
                            row.put(fieldNames.get(col), stringify(batch.cols[col], rowNum));
                        }
                        rows.add(row);
                    }
                }
            }
        }
        return rows;
    }

    /**
     * Converts the value stored at {@code rowNum} of {@code columnVector} to a string.
     *
     * @param columnVector the column to read from
     * @param rowNum       index of the row inside the current batch
     * @return the string form of the value, or {@code null} when the cell is null or the
     *         vector type is not handled
     */
    private static String stringify(ColumnVector columnVector, int rowNum) {
        // A repeating vector stores its single value (and its null flag) in slot 0 only.
        int row = columnVector.isRepeating ? 0 : rowNum;
        // isNull is only meaningful when noNulls is false.
        if (!columnVector.noNulls && columnVector.isNull[row]) {
            return null;
        }

        if (columnVector instanceof BytesColumnVector) {
            return ((BytesColumnVector) columnVector).toString(row);
        }
        // DateColumnVector and Decimal64ColumnVector are subclasses of LongColumnVector,
        // so they have to be tested before the generic LongColumnVector branch.
        if (columnVector instanceof DateColumnVector) {
            return ((DateColumnVector) columnVector).formatDate(row);
        }
        if (columnVector instanceof Decimal64ColumnVector) {
            Decimal64ColumnVector decimal64 = (Decimal64ColumnVector) columnVector;
            // The long holds the unscaled value; apply the column scale to get the decimal.
            return java.math.BigDecimal.valueOf(decimal64.vector[row], decimal64.scale).toString();
        }
        if (columnVector instanceof DecimalColumnVector) {
            return ((DecimalColumnVector) columnVector).vector[row].toString();
        }
        if (columnVector instanceof DoubleColumnVector) {
            return ((DoubleColumnVector) columnVector).vector[row] + "";
        }
        if (columnVector instanceof LongColumnVector) {
            return ((LongColumnVector) columnVector).vector[row] + "";
        }
        if (columnVector instanceof TimestampColumnVector) {
            // Same representation as before: the raw content of the time[] array.
            return ((TimestampColumnVector) columnVector).time[row] + "";
        }
        // List and map vectors are MultiValuedColumnVector subclasses; together with
        // interval, struct and union vectors they are rendered via stringifyValue.
        if (columnVector instanceof MultiValuedColumnVector
                || columnVector instanceof IntervalDayTimeColumnVector
                || columnVector instanceof StructColumnVector
                || columnVector instanceof UnionColumnVector) {
            StringBuilder builder = new StringBuilder();
            columnVector.stringifyValue(builder, row);
            return builder.toString();
        }
        // Unrecognised vector type: keep the original behaviour of yielding null.
        return null;
    }
}
二,依赖如下
<dependency>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<version>1.6.7</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.2.1</version>
<exclusions>
<exclusion>
<groupId>tomcat</groupId>
<artifactId>jasper-runtime</artifactId>
</exclusion>
<exclusion>
<groupId>tomcat</groupId>
<artifactId>jasper-compiler</artifactId>
</exclusion>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jsp-2.1</artifactId>
</exclusion>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jsp-api-2.1</artifactId>
</exclusion>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
</exclusions>
</dependency>