apache · lxy-9602 · Jun 25, 2026
diff --git a/LICENSE b/LICENSE
@@ -502,6 +502,16 @@ POSSIBILITY OF SUCH DAMAGE.
 
 --------------------------------------------------------------------------------
 
+This product includes code from LucenePlusPlus.
+
+* LucenePlusPlus utility in src/paimon/global_index/lucene/ directory
+
+Copyright: 2009-2014 Alan Wright.
+Home page: https://github.com/luceneplusplus/LucenePlusPlus
+License: https://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
 This product includes code from cppjieba.
 
 * cppjieba utility in src/paimon/global_index/lucene/ directory

diff --git a/NOTICE b/NOTICE
@@ -29,5 +29,8 @@ Copyright (C) 2012-2023 Yann Collet
 This product includes software from CRoaring project (Apache 2.0)
 Copyright 2016-2022 The CRoaring authors
 
+This product includes software from LucenePlusPlus project (Apache 2.0)
+Copyright 2009-2014 Alan Wright.
+
 This product includes software from cppjieba project (MIT)
 Copyright 2013
diff --git a/src/paimon/global_index/lucene/jieba_analyzer.cpp b/src/paimon/global_index/lucene/jieba_analyzer.cpp
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "paimon/global_index/lucene/jieba_analyzer.h"
+
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/global_index/lucene/lucene_utils.h"
+
+namespace paimon::lucene {
+JiebaTokenizerContext::JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
+                                             const std::shared_ptr<cppjieba::Jieba>& _jieba,
+                                             const std::shared_ptr<MemoryPool>& _pool,
+                                             int32_t _buffer_size)
+    : pool(_pool),
+      tokenize_mode(_tokenize_mode),
+      with_position(_with_position),
+      buffer_size(_buffer_size),
+      jieba(_jieba) {}
+
+JiebaTokenizer::JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input)
+    : Lucene::Tokenizer(input), context_(context) {
+    term_att_ = addAttribute<Lucene::TermAttribute>();  // written by incrementToken()
+    pos_att_ = addAttribute<Lucene::PositionIncrementAttribute>();
+    const size_t bytes = context_.buffer_size * sizeof(wchar_t);  // released in the destructor
+    buffer_ = static_cast<wchar_t*>(context_.pool->Malloc(bytes, /*alignment=*/8));
+}
+
+JiebaTokenizer::~JiebaTokenizer() {
+    if (buffer_ == nullptr) {  // no buffer to hand back to the pool
+        return;
+    }
+    const size_t bytes = context_.buffer_size * sizeof(wchar_t);  // same size as allocated
+    context_.pool->Free(static_cast<void*>(buffer_), bytes, /*alignment=*/8);
+    buffer_ = nullptr;
+}
+
+// Emits the next normalized term through the attributes registered in the constructor.
+// Returns false once every term produced by the last reset has been handed out.
+bool JiebaTokenizer::incrementToken() {
+    const bool has_next = term_index_ < normalized_terms_.size();
+    if (has_next) {
+        clearAttributes();
+        const std::string_view current = normalized_terms_[term_index_];
+        ++term_index_;
+
+        // Terms are kept as narrow strings; the term attribute is fed the converted wide form.
+        term_att_->setTermBuffer(LuceneUtils::StringToWstring(current));
+        // with_position: each token advances the position by one; otherwise the increment is 0
+        // for every token.
+        pos_att_->setPositionIncrement(context_.with_position ? 1 : 0);
+    }
+    return has_next;
+}
+
+// Segments `str` with the cppjieba routine selected by `tokenize_mode`, filling `*terms_ptr`.
+void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
+                                 const std::string& str, std::vector<std::string>* terms_ptr) {
+    if (tokenize_mode == "mp") {
+        jieba->CutSmall(str, *terms_ptr, /*max_word_len=*/JiebaTokenizerContext::kMaxWordLen);
+    } else if (tokenize_mode == "hmm") {
+        jieba->CutHMM(str, *terms_ptr);
+    } else if (tokenize_mode == "mix") {
+        jieba->Cut(str, *terms_ptr, /*hmm=*/true);
+    } else if (tokenize_mode == "full") {
+        jieba->CutAll(str, *terms_ptr);
+    } else if (tokenize_mode == "query") {
+        jieba->CutForSearch(str, *terms_ptr, /*hmm=*/true);
+    } else {  // unknown mode: reject it instead of silently picking a default
+        throw Lucene::IllegalArgumentException(
+            L"only support mp/hmm/mix/full/query in jieba tokenizer");
+    }
+}
+
+// Filters `*input_ptr` and fills `*output_ptr` with views of the surviving terms.
+//  - blank (empty / whitespace-only) terms and stop words are skipped;
+//  - terms consisting only of std::isalnum characters are lowercased in place, and the
+//    stop-word lookup is repeated on the lowercased text, so "The" is skipped whenever "the"
+//    is a stop word (a single pre-lowercase lookup would let it through as "the").
+// The views alias the strings owned by `*input_ptr`; they dangle once that vector changes.
+void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words,
+                               std::vector<std::string>* input_ptr,
+                               std::vector<std::string_view>* output_ptr) {
+    const auto is_alnum = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };
+    const auto to_lower = [](char c) {
+        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+    };
+
+    auto& output = *output_ptr;
+    output.clear();
+    output.reserve(input_ptr->size());
+    for (auto& term : *input_ptr) {
+        if (StringUtils::IsNullOrWhitespaceOnly(term) || stop_words.count(term) != 0) {
+            continue;
+        }
+        if (std::all_of(term.begin(), term.end(), is_alnum)) {
+            std::transform(term.begin(), term.end(), term.begin(), to_lower);
+            // The original spelling may differ from a stop word only by letter case.
+            if (stop_words.count(term) != 0) {
+                continue;
+            }
+        }
+        output.emplace_back(term.data(), term.length());
+    }
+}
+
+void JiebaTokenizer::reset() {
+    Lucene::Tokenizer::reset();
+    InnerReset();  // drains `input` and rebuilds the term list from scratch
+}
+
+void JiebaTokenizer::reset(const Lucene::ReaderPtr& input) {
+    Lucene::Tokenizer::reset(input);  // base class receives the new reader first
+    InnerReset();                     // then the reader is drained and tokenized
+}
+
+// Discards the previous terms, drains the reader into a wide string, segments it with jieba
+// and rebuilds the normalized term list that incrementToken() walks.
+void JiebaTokenizer::InnerReset() {
+    term_index_ = 0;
+    normalized_terms_.clear();  // these are views into terms_, so drop them first
+    terms_.clear();
+
+    // Read the input in chunks of at most buffer_size wide characters until read() reports
+    // nothing more (a return value <= 0).
+    Lucene::String text;
+    text.reserve(context_.buffer_size);
+    int32_t got = 0;
+    while ((got = input->read(buffer_, /*offset=*/0, context_.buffer_size)) > 0) {
+        text.append(buffer_, got);
+    }
+
+    // CutWithMode takes a narrow std::string, hence the conversion.
+    // TODO(xinyu.lxy): support porter2 stemmer
+    const std::string narrow = LuceneUtils::WstringToString(text);
+    CutWithMode(context_.tokenize_mode, context_.jieba.get(), narrow, &terms_);
+    Normalize(context_.jieba->extractor.GetStopWords(), &terms_, &normalized_terms_);
+}
+
+}  // namespace paimon::lucene
diff --git a/src/paimon/global_index/lucene/jieba_analyzer.h b/src/paimon/global_index/lucene/jieba_analyzer.h
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "cppjieba/Jieba.hpp"
+#include "lucene++/LuceneHeaders.h"
+#include "lucene++/MiscUtils.h"
+#include "lucene++/PositionIncrementAttribute.h"
+#include "lucene++/TermAttribute.h"
+#include "paimon/global_index/lucene/lucene_utils.h"
+#include "paimon/memory/memory_pool.h"
+namespace paimon::lucene {
+// Settings and shared resources that are copied into every JiebaTokenizer.
+struct JiebaTokenizerContext {
+    JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
+                          const std::shared_ptr<cppjieba::Jieba>& _jieba,
+                          const std::shared_ptr<MemoryPool>& _pool,
+                          int32_t _buffer_size = kReadBufferSize);
+
+    std::shared_ptr<MemoryPool> pool;        // allocates the tokenizer's read buffer
+    std::string tokenize_mode;               // one of mp/hmm/mix/full/query
+    bool with_position;                      // position increment per token: 1 if true, else 0
+    int32_t buffer_size;                     // read buffer capacity, in wchar_t elements
+    std::shared_ptr<cppjieba::Jieba> jieba;  // segmenter; also supplies the stop-word set
+    static constexpr int32_t kReadBufferSize = 5 * 1024 * 1024;  // default for buffer_size
+    static constexpr int32_t kMaxWordLen = 1024;  // max_word_len given to CutSmall ("mp" mode)
+};
+
+// Tokenizer that segments the whole input with cppjieba on reset() and replays the terms.
+class JiebaTokenizer : public Lucene::Tokenizer {
+ public:
+    JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input);
+    ~JiebaTokenizer() override;
+    // Owns a raw pool buffer freed in the destructor; a copy would free it twice.
+    JiebaTokenizer(const JiebaTokenizer&) = delete;
+    JiebaTokenizer& operator=(const JiebaTokenizer&) = delete;
+
+    bool incrementToken() override;
+    void reset(const Lucene::ReaderPtr& input) override;
+    void reset() override;
+
+    static void CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
+                            const std::string& str, std::vector<std::string>* terms_ptr);
+
+    // Drops blank terms and stop words and lowercases alphanumeric terms of `input` in place
+    // (no copy). `output` holds views into `input`, so `input` must stay alive and unmodified.
+    static void Normalize(const std::unordered_set<std::string>& stop_words,
+                          std::vector<std::string>* input, std::vector<std::string_view>* output);
+
+ private:
+    void InnerReset();
+    JiebaTokenizerContext context_;
+    size_t term_index_ = 0;                           // next entry of normalized_terms_ to emit
+    std::vector<std::string> terms_;                  // raw jieba output; backs the views below
+    std::vector<std::string_view> normalized_terms_;  // filtered terms, aliasing terms_
+    wchar_t* buffer_ = nullptr;                       // context_.buffer_size wchar_t elements
+    Lucene::TermAttributePtr term_att_;
+    Lucene::PositionIncrementAttributePtr pos_att_;
+};
+
+// Analyzer whose token streams are JiebaTokenizer instances built from one stored context.
+class JiebaAnalyzer : public Lucene::Analyzer {
+ public:
+    explicit JiebaAnalyzer(const JiebaTokenizerContext& context) : context_(context) {}
+    ~JiebaAnalyzer() override = default;
+
+    // Builds a fresh tokenizer over `reader`; the field name is not used.
+    Lucene::TokenStreamPtr tokenStream(const Lucene::String& /*field_name*/,
+                                       const Lucene::ReaderPtr& reader) override {
+        return Lucene::newLucene<JiebaTokenizer>(context_, reader);
+    }
+
+ private:
+    JiebaTokenizerContext context_;
+};
+}  // namespace paimon::lucene
diff --git a/src/paimon/global_index/lucene/jieba_analyzer_test.cpp b/src/paimon/global_index/lucene/jieba_analyzer_test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "paimon/global_index/lucene/jieba_analyzer.h"
+
+#include "cppjieba/Jieba.hpp"
+#include "gtest/gtest.h"
+#include "lucene++/LuceneHeaders.h"
+#include "paimon/global_index/lucene/lucene_utils.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/testing/utils/testharness.h"
+namespace paimon::lucene::test {
+// Fixture parameterized by the tokenizer read-buffer size (in wide characters).
+class JiebaAnalyzerTest : public ::testing::TestWithParam<int32_t> {
+ public:
+    // Token stream over a fixed short Chinese sentence.
+    Lucene::TokenStreamPtr CreateJiebaTokenizer(bool with_position) const {
+        return CreateJiebaTokenizer(with_position, L"我爱机器学习");
+    }
+
+    // Builds a "query"-mode JiebaAnalyzer from the dictionaries found under
+    // LuceneUtils::GetJiebaDictionaryDir() and returns its token stream over `text`.
+    Lucene::TokenStreamPtr CreateJiebaTokenizer(bool with_position,
+                                                const Lucene::String& text) const {
+        const std::string dir = LuceneUtils::GetJiebaDictionaryDir().value();
+        auto jieba = std::make_shared<cppjieba::Jieba>(
+            dir + "/jieba.dict.utf8", dir + "/hmm_model.utf8", dir + "/user.dict.utf8",
+            dir + "/idf.utf8", dir + "/stop_words.utf8");
+        // GetParam() is the read-buffer size under test.
+        JiebaTokenizerContext context(/*tokenize_mode=*/"query", with_position, jieba,
+                                      GetDefaultPool(), /*buffer_size=*/GetParam());
+        auto analyzer = Lucene::newLucene<JiebaAnalyzer>(context);
+        auto reader = Lucene::newLucene<Lucene::StringReader>(text);
+        return analyzer->tokenStream(/*field_name*/ L"f0", reader);
+    }
+};
+
+// Checks the terms produced for the default sentence when positions are disabled.
+TEST_P(JiebaAnalyzerTest, TestSimple) {
+    auto stream = CreateJiebaTokenizer(/*with_position=*/false);
+    auto term_att = stream->addAttribute<Lucene::TermAttribute>();
+
+    std::vector<Lucene::String> actual;
+    stream->reset();
+    while (stream->incrementToken()) {
+        actual.push_back(term_att->term());
+    }
+    stream->end();
+    stream->close();
+    const std::vector<Lucene::String> expected = {L"爱", L"机器", L"学习"};
+    ASSERT_EQ(expected, actual);
+}
+
+// With positions enabled every term advances the position by one: 1, 2, 3.
+TEST_P(JiebaAnalyzerTest, TestWithPosition) {
+    auto stream = CreateJiebaTokenizer(/*with_position=*/true);
+    auto term_att = stream->addAttribute<Lucene::TermAttribute>();
+    auto pos_att = stream->addAttribute<Lucene::PositionIncrementAttribute>();
+
+    std::vector<Lucene::String> actual_terms;
+    std::vector<int32_t> actual_positions;
+    stream->reset();
+    // Accumulate the increments so the assertion checks absolute positions.
+    for (int32_t position = 0; stream->incrementToken();) {
+        position += pos_att->getPositionIncrement();
+        actual_positions.push_back(position);
+        actual_terms.push_back(term_att->term());
+    }
+    stream->end();
+    stream->close();
+
+    const std::vector<Lucene::String> expected_terms = {L"爱", L"机器", L"学习"};
+    const std::vector<int32_t> expected_positions = {1, 2, 3};
+    ASSERT_EQ(expected_terms, actual_terms);
+    ASSERT_EQ(expected_positions, actual_positions);
+}
+
+// Mixed Chinese/English input: the expected terms carry no punctuation and are lowercase.
+TEST_P(JiebaAnalyzerTest, TestNormalize) {
+    auto stream = CreateJiebaTokenizer(
+        /*with_position=*/false,
+        L"由于购买了Iphone14，我越来越热爱网上学习了！Happy work, happy day! \n\t");
+    auto term_att = stream->addAttribute<Lucene::TermAttribute>();
+
+    std::vector<Lucene::String> actual;
+    stream->reset();
+    while (stream->incrementToken()) {
+        actual.push_back(term_att->term());
+    }
+    stream->end();
+    stream->close();
+    const std::vector<Lucene::String> expected = {L"购买", L"iphone14", L"越来", L"越来越",
+                                                  L"热爱", L"网上",     L"学习", L"happy",
+                                                  L"work", L"happy",    L"day"};
+    ASSERT_EQ(expected, actual);
+}
+
+// Buffer sizes from tiny (forces multi-chunk reads) to larger than any test input.
+INSTANTIATE_TEST_SUITE_P(ReadBufferSize, JiebaAnalyzerTest, ::testing::Values(2, 5, 10, 100));
+
+}  // namespace paimon::lucene::test