当前位置: 首页 > 工具软件 > Orc > 使用案例 >

JAVA读取ORC文件还原数据

仲元凯
2023-12-01

JAVA读取ORC文件还原数据
踩过不少坑,笔者快要吐血了,但愿能帮助同行解决问题。废话不多说,直接上代码。

一、代码示例

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.*;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Utility for loading an ORC file fully into memory as a list of rows, where each row maps a
 * top-level field name to the string form of its value.
 */
public class ReadOrcUtils {

    /** Maximum number of rows requested per {@link VectorizedRowBatch}. */
    private static final int BATCH_SIZE = 1024 * 1024;

    /**
     * Reads every row of the ORC file at {@code path}.
     *
     * @param configuration Hadoop configuration used to open the file
     * @param path          location of the ORC file
     * @return one map per row, keyed by the schema's top-level field names; a value is
     *         {@code null} when the cell is NULL or the column vector type is not handled
     * @throws IOException if the file cannot be opened or read
     */
    public static List<Map<String, String>> read(Configuration configuration, String path) throws IOException {
        List<Map<String, String>> rows = new LinkedList<>();
        try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(configuration))) {
            TypeDescription schema = reader.getSchema();
            try (RecordReader records = reader.rows(reader.options())) {
                VectorizedRowBatch batch = schema.createRowBatch(BATCH_SIZE);
                List<String> fieldNames = schema.getFieldNames();
                while (records.nextBatch(batch)) {
                    for (int rowNum = 0; rowNum < batch.size; rowNum++) {
                        Map<String, String> row = new HashMap<>();
                        for (int col = 0; col < batch.cols.length; col++) {
                            // The value is addressed by the ROW index; the column index only
                            // selects which vector to read from.
                            row.put(fieldNames.get(col), stringify(batch.cols[col], rowNum));
                        }
                        rows.add(row);
                    }
                }
            }
        }
        return rows;
    }

    /**
     * Converts the cell at {@code rowNum} of {@code columnVector} to a string.
     *
     * @return the string form of the cell, or {@code null} for a NULL cell or an unhandled
     *         vector type
     */
    private static String stringify(ColumnVector columnVector, int rowNum) {
        // A repeating vector keeps its single shared value (and null flag) at index 0.
        int row = columnVector.isRepeating ? 0 : rowNum;
        if (!columnVector.noNulls && columnVector.isNull[row]) {
            return null;
        }

        if (columnVector instanceof BytesColumnVector) {
            return ((BytesColumnVector) columnVector).toString(row);
        }
        // DateColumnVector and Decimal64ColumnVector must be tested before LongColumnVector,
        // as in the original ordering of these checks.
        if (columnVector instanceof DateColumnVector) {
            return ((DateColumnVector) columnVector).formatDate(row);
        }
        if (columnVector instanceof Decimal64ColumnVector) {
            // NOTE(review): this is the raw long stored in the vector; presumably it is the
            // unscaled decimal value — confirm whether the scale should be applied.
            return String.valueOf(((Decimal64ColumnVector) columnVector).vector[row]);
        }
        if (columnVector instanceof DecimalColumnVector) {
            return ((DecimalColumnVector) columnVector).vector[row].toString();
        }
        if (columnVector instanceof DoubleColumnVector) {
            return String.valueOf(((DoubleColumnVector) columnVector).vector[row]);
        }
        if (columnVector instanceof LongColumnVector) {
            return String.valueOf(((LongColumnVector) columnVector).vector[row]);
        }
        if (columnVector instanceof TimestampColumnVector) {
            return String.valueOf(((TimestampColumnVector) columnVector).time[row]);
        }
        // List and map vectors are MultiValuedColumnVector subtypes; together with interval,
        // struct and union vectors they are rendered through the vector's own stringifier.
        if (columnVector instanceof MultiValuedColumnVector
                || columnVector instanceof IntervalDayTimeColumnVector
                || columnVector instanceof StructColumnVector
                || columnVector instanceof UnionColumnVector) {
            StringBuilder builder = new StringBuilder();
            columnVector.stringifyValue(builder, row);
            return builder.toString();
        }
        return null;
    }
}

二、依赖如下

<dependency>
            <groupId>org.apache.orc</groupId>
            <artifactId>orc-core</artifactId>
            <version>1.6.7</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
<dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
            <exclusions>
                <exclusion>
                    <groupId>tomcat</groupId>
                    <artifactId>jasper-runtime</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>tomcat</groupId>
                    <artifactId>jasper-compiler</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.mortbay.jetty</groupId>
                    <artifactId>jsp-2.1</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.mortbay.jetty</groupId>
                    <artifactId>jsp-api-2.1</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.mortbay.jetty</groupId>
                    <artifactId>jetty-util</artifactId>
                </exclusion>
            </exclusions>
        </dependency>


 类似资料: