Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,580 changes: 1,580 additions & 0 deletions src/paimon/format/orc/orc_adapter.cpp

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions src/paimon/format/orc/orc_adapter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Adapted from Apache Arrow
// https://github.com/apache/arrow/blob/main/cpp/src/arrow/adapters/orc/util.h

#pragma once

#include <cstdint>
#include <memory>
#include <string>

#include "arrow/api.h"
#include "arrow/array/array_base.h"
#include "arrow/array/builder_base.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/key_value_metadata.h"
#include "orc/OrcFile.hh"
#include "orc/Type.hh"
#include "paimon/result.h"
#include "paimon/status.h"

// Forward declarations for types that the OrcAdapter interface below only
// refers to through pointers.
// NOTE(review): "orc/OrcFile.hh" and "arrow/api.h" are already included above
// and presumably provide these types as well -- confirm whether the forward
// declarations are still needed.
namespace orc {
struct ColumnVectorBatch;
} // namespace orc

namespace arrow {
class MemoryPool;
} // namespace arrow

namespace paimon::orc {
// Static-only helper for converting between ORC (::orc) and Arrow (arrow::)
// types and data. Both the constructor and the destructor are deleted, so the
// class can never be instantiated; every entry point is a static function.
// Fallible operations report errors through paimon's Result / Status types.
class OrcAdapter {
public:
OrcAdapter() = delete;
~OrcAdapter() = delete;

// Returns the Arrow data type for the given ORC type, or an error.
// NOTE(review): handling of a null `type` is decided in orc_adapter.cpp,
// which is not visible here -- confirm before passing nullptr.
static Result<std::shared_ptr<arrow::DataType>> GetArrowType(const ::orc::Type* type);

// Returns a newly allocated ORC type built from the given Arrow schema, or
// an error. Ownership of the ORC type is handed to the caller through the
// unique_ptr.
static Result<std::unique_ptr<::orc::Type>> GetOrcType(const arrow::Schema& schema);

// Returns Arrow key/value metadata derived from the given ORC type.
// NOTE(review): presumably this carries the ORC type attributes over to the
// Arrow field -- confirm against the implementation.
static Result<std::shared_ptr<const arrow::KeyValueMetadata>> GetFieldMetadata(
const ::orc::Type* type);

// Returns an Arrow field called `name` for the given ORC type.
// `nullable` defaults to true when the caller does not specify it.
static Result<std::shared_ptr<arrow::Field>> GetArrowField(const std::string& name,
const ::orc::Type* type,
bool nullable = true);

// Returns an Arrow array of the requested `type` produced from the ORC
// column vector `batch`.
// NOTE(review): `pool` is presumably the memory pool used for the Arrow
// allocations made during the conversion -- confirm.
static Result<std::shared_ptr<arrow::Array>> AppendBatch(
const std::shared_ptr<arrow::DataType>& type, ::orc::ColumnVectorBatch* batch,
arrow::MemoryPool* pool);

// Writes the contents of the Arrow `array` into `column_vector_batch` and
// returns a Status describing success or failure.
// NOTE(review): whether the batch must be pre-sized by the caller is not
// visible from this header -- confirm.
static Status WriteBatch(const std::shared_ptr<arrow::Array>& array,
::orc::ColumnVectorBatch* column_vector_batch);
};
} // namespace paimon::orc
66 changes: 66 additions & 0 deletions src/paimon/format/orc/orc_file_format.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <map>
#include <memory>
#include <string>

#include "arrow/c/bridge.h"
#include "arrow/c/helpers.h"
#include "paimon/common/utils/arrow/status_utils.h"
#include "paimon/format/file_format.h"
#include "paimon/format/orc/orc_reader_builder.h"
#include "paimon/format/orc/orc_stats_extractor.h"
#include "paimon/format/orc/orc_writer_builder.h"

namespace paimon::orc {

/// FileFormat implementation for ORC.
///
/// Holds the format identifier ("orc") together with a private copy of the
/// option map it was constructed with, and hands out ORC-specific reader
/// builders, writer builders and stats extractors.
class OrcFileFormat : public FileFormat {
 public:
    /// @param options Format options; copied into this object and later passed
    ///        to the reader and writer builders it creates.
    explicit OrcFileFormat(const std::map<std::string, std::string>& options)
        : identifier_("orc"), options_(options) {}

    /// @return The identifier of this format, i.e. "orc".
    const std::string& Identifier() const override { return identifier_; }

    /// Creates an OrcReaderBuilder from the stored options and `batch_size`.
    Result<std::unique_ptr<ReaderBuilder>> CreateReaderBuilder(
        int32_t batch_size) const override {
        return std::make_unique<OrcReaderBuilder>(options_, batch_size);
    }

    /// Imports the Arrow C schema and creates an OrcWriterBuilder for it.
    ///
    /// An Arrow import failure is propagated to the caller by the
    /// PAIMON_ASSIGN_OR_RAISE_FROM_ARROW macro.
    /// NOTE(review): arrow::ImportSchema is documented to consume (release)
    /// the C struct -- confirm callers do not reuse `schema` afterwards.
    Result<std::unique_ptr<WriterBuilder>> CreateWriterBuilder(
        ::ArrowSchema* schema, int32_t batch_size) const override {
        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> arrow_schema,
                                          arrow::ImportSchema(schema));
        return std::make_unique<OrcWriterBuilder>(arrow_schema, batch_size, options_);
    }

    /// Imports the Arrow C schema and creates an OrcStatsExtractor for it.
    ///
    /// An Arrow import failure is propagated to the caller by the
    /// PAIMON_ASSIGN_OR_RAISE_FROM_ARROW macro.
    Result<std::unique_ptr<FormatStatsExtractor>> CreateStatsExtractor(
        ::ArrowSchema* schema) const override {
        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> arrow_schema,
                                          arrow::ImportSchema(schema));
        return std::make_unique<OrcStatsExtractor>(arrow_schema);
    }

 private:
    /// Always "orc"; stored as a member so Identifier() can return a reference.
    std::string identifier_;
    /// Copy of the options given to the constructor.
    std::map<std::string, std::string> options_;
};
} // namespace paimon::orc
36 changes: 36 additions & 0 deletions src/paimon/format/orc/orc_file_format_factory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "paimon/format/orc/orc_file_format_factory.h"

#include <utility>

#include "paimon/factories/factory.h"
#include "paimon/format/orc/orc_file_format.h"

namespace paimon::orc {
// Out-of-class definition of the static member declared in
// orc_file_format_factory.h; this is the string returned by Identifier().
const char OrcFileFormatFactory::IDENTIFIER[] = "orc";

// Creates a new OrcFileFormat constructed from `options` and returns it as a
// FileFormat owned by the caller. No error path is taken in this function
// itself; the value is wrapped into the Result by implicit conversion.
Result<std::unique_ptr<FileFormat>> OrcFileFormatFactory::Create(
const std::map<std::string, std::string>& options) const {
return std::make_unique<OrcFileFormat>(options);
}

// NOTE(review): presumably registers OrcFileFormatFactory with paimon's factory
// registry so it can be looked up by identifier -- confirm against the macro
// definition in paimon/factories/factory.h.
REGISTER_PAIMON_FACTORY(OrcFileFormatFactory);

} // namespace paimon::orc
43 changes: 43 additions & 0 deletions src/paimon/format/orc/orc_file_format_factory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <map>
#include <memory>
#include <string>

#include "paimon/format/file_format.h"
#include "paimon/format/file_format_factory.h"
#include "paimon/result.h"

namespace paimon::orc {

// FileFormatFactory implementation that produces ORC FileFormat instances.
class OrcFileFormatFactory : public FileFormatFactory {
public:
// Identifier string of this factory; defined out of class in
// orc_file_format_factory.cpp.
static const char IDENTIFIER[];

// Returns IDENTIFIER.
const char* Identifier() const override {
return IDENTIFIER;
}

// Creates a FileFormat configured with `options`; defined in
// orc_file_format_factory.cpp.
Result<std::unique_ptr<FileFormat>> Create(
const std::map<std::string, std::string>& options) const override;
};

} // namespace paimon::orc
59 changes: 59 additions & 0 deletions src/paimon/format/orc/orc_format_defs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>

namespace paimon::orc {
// Option keys and default values used by the ORC format.
//
// Every constant is an `inline constexpr` variable (C++17). The previous
// `static inline` / `static constexpr` spelling gave each constant internal
// linkage, so every translation unit including this header received its own
// private copy and the `inline` keyword had no effect. With `inline constexpr`
// all translation units share a single definition. Names, types and values are
// unchanged.

// write options
inline constexpr char ORC_STRIPE_SIZE[] = "orc.stripe.size";
inline constexpr std::size_t DEFAULT_STRIPE_SIZE = 64 * 1024 * 1024;

inline constexpr char ORC_ROW_INDEX_STRIDE[] = "orc.row.index.stride";
inline constexpr std::size_t DEFAULT_ROW_INDEX_STRIDE = 10000;

inline constexpr char ORC_COMPRESSION_BLOCK_SIZE[] = "orc.compression.block-size";
inline constexpr std::size_t DEFAULT_COMPRESSION_BLOCK_SIZE = 64 * 1024;

inline constexpr char ORC_DICTIONARY_KEY_SIZE_THRESHOLD[] = "orc.dictionary-key-size-threshold";
inline constexpr double DEFAULT_DICTIONARY_KEY_SIZE_THRESHOLD = 0.8;

// The default value of ORC_WRITE_ENABLE_METRICS is false.
inline constexpr char ORC_WRITE_ENABLE_METRICS[] = "orc.write.enable-metrics";

// The default value of ORC_TIMESTAMP_LTZ_LEGACY_TYPE is true. This option keeps
// compatibility with paimon-orc's old behavior for the `timestamp_ltz` data
// type. Details at https://github.com/apache/paimon/issues/5066.
inline constexpr char ORC_TIMESTAMP_LTZ_LEGACY_TYPE[] = "orc.timestamp-ltz.legacy.type";

// read options
// The default value of ORC_READ_ENABLE_LAZY_DECODING is false.
inline constexpr char ORC_READ_ENABLE_LAZY_DECODING[] = "orc.read.enable-lazy-decoding";

inline constexpr char ORC_NATURAL_READ_SIZE[] = "orc.read.natural-read-size";
inline constexpr uint64_t DEFAULT_NATURAL_READ_SIZE = 1024 * 1024;

// The default value of ORC_READ_ENABLE_METRICS is false.
inline constexpr char ORC_READ_ENABLE_METRICS[] = "orc.read.enable-metrics";

// NOTE(review): presumably the lower bound on how many row groups a single
// natural read covers -- confirm against the reader implementation.
inline constexpr uint64_t MIN_ROW_GROUP_COUNT_IN_ONE_NATURAL_READ = 1;

inline constexpr char ENABLE_PREFETCH_READ_SIZE_THRESHOLD[] =
    "orc.read.enable-prefetch-read-size-threshold";
// Prefetching is not enabled when the total amount of data queried is below
// this threshold, because prefetching very small data sets is not beneficial.
inline constexpr uint64_t DEFAULT_ENABLE_PREFETCH_READ_SIZE_THRESHOLD = 10ull * 1024 * 1024;

}  // namespace paimon::orc
Loading