Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,16 @@ POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------

This product includes code from LucenePlusPlus.

* LucenePlusPlus utility in src/paimon/global_index/lucene/ directory

Copyright: 2009-2014 Alan Wright.
Home page: https://github.com/luceneplusplus/LucenePlusPlus
License: https://www.apache.org/licenses/LICENSE-2.0

--------------------------------------------------------------------------------

This product includes code from cppjieba.

* cppjieba utility in src/paimon/global_index/lucene/ directory
Expand Down
3 changes: 3 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,8 @@ Copyright (C) 2012-2023 Yann Collet
This product includes software from CRoaring project (Apache 2.0)
Copyright 2016-2022 The CRoaring authors

This product includes software from LucenePlusPlus project (Apache 2.0)
Copyright 2009-2014 Alan Wright.

This product includes software from cppjieba project (MIT)
Copyright 2013
153 changes: 153 additions & 0 deletions src/paimon/global_index/lucene/jieba_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "paimon/global_index/lucene/jieba_analyzer.h"

#include "paimon/common/utils/string_utils.h"
#include "paimon/global_index/lucene/lucene_utils.h"

namespace paimon::lucene {
// Captures the tokenizer configuration. Every argument is copied into the member of the same
// name (without the leading underscore); nothing is validated here.
// The initializer list is written in the same order as the members are declared in the struct.
JiebaTokenizerContext::JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
const std::shared_ptr<cppjieba::Jieba>& _jieba,
const std::shared_ptr<MemoryPool>& _pool,
int32_t _buffer_size)
: pool(_pool),
tokenize_mode(_tokenize_mode),
with_position(_with_position),
buffer_size(_buffer_size),
jieba(_jieba) {}

// Builds a tokenizer over `input`: stores a copy of `context`, registers the term and
// position-increment attributes, and allocates the wide-character read buffer
// (`buffer_size` wchar_t elements, 8-byte aligned) from the context's memory pool.
//
// Throws Lucene::IllegalArgumentException when `context.buffer_size` is not positive.
JiebaTokenizer::JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input)
    : Lucene::Tokenizer(input), context_(context) {
    // A negative size would be converted to a huge size_t in the multiplication below, and a
    // zero size would make the read loop in InnerReset() request no characters at all, so the
    // tokenizer would silently produce no tokens. Reject both up front.
    if (context_.buffer_size <= 0) {
        throw Lucene::IllegalArgumentException(
            L"buffer_size must be positive in jieba tokenizer");
    }

    term_att_ = addAttribute<Lucene::TermAttribute>();
    pos_att_ = addAttribute<Lucene::PositionIncrementAttribute>();

    // The destructor releases this buffer with the same size and alignment.
    const size_t buffer_bytes = static_cast<size_t>(context_.buffer_size) * sizeof(wchar_t);
    buffer_ = static_cast<wchar_t*>(context_.pool->Malloc(buffer_bytes, /*alignment=*/8));
}

// Hands the read buffer back to the pool it came from, using the same byte size and alignment
// that the constructor passed to Malloc. Does nothing when no buffer is held.
JiebaTokenizer::~JiebaTokenizer() {
    if (buffer_ == nullptr) {
        return;
    }
    const auto buffer_bytes = context_.buffer_size * sizeof(wchar_t);
    context_.pool->Free(static_cast<void*>(buffer_), buffer_bytes, /*alignment=*/8);
    buffer_ = nullptr;
}

// Publishes the next normalized term through the term attribute.
//
// Returns false once every term produced by the last reset has been emitted; otherwise
// clears the attributes, sets the term text and the position increment, and returns true.
bool JiebaTokenizer::incrementToken() {
    const bool exhausted = term_index_ >= normalized_terms_.size();
    if (exhausted) {
        return false;
    }

    const auto& current = normalized_terms_[term_index_];
    ++term_index_;

    clearAttributes();
    term_att_->setTermBuffer(LuceneUtils::StringToWstring(current));

    // With positions enabled each term advances by one; otherwise the increment is always 0.
    pos_att_->setPositionIncrement(context_.with_position ? 1 : 0);
    return true;
}

// Segments `str` with the cppjieba routine selected by `tokenize_mode` and stores the pieces
// in `*terms_ptr`.
//
// Supported modes: "mp" (CutSmall, capped at kMaxWordLen), "hmm" (CutHMM), "mix" (Cut with HMM),
// "full" (CutAll) and "query" (CutForSearch with HMM).
// Throws Lucene::IllegalArgumentException for any other mode.
void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
                                 const std::string& str, std::vector<std::string>* terms_ptr) {
    std::vector<std::string>& segments = *terms_ptr;

    if (tokenize_mode == "mp") {
        jieba->CutSmall(str, segments, /*max_word_len=*/JiebaTokenizerContext::kMaxWordLen);
        return;
    }
    if (tokenize_mode == "hmm") {
        jieba->CutHMM(str, segments);
        return;
    }
    if (tokenize_mode == "mix") {
        jieba->Cut(str, segments, /*hmm=*/true);
        return;
    }
    if (tokenize_mode == "full") {
        jieba->CutAll(str, segments);
        return;
    }
    if (tokenize_mode == "query") {
        jieba->CutForSearch(str, segments, /*hmm=*/true);
        return;
    }

    throw Lucene::IllegalArgumentException(
        L"only support mp/hmm/mix/full/query in jieba tokenizer");
}

// Filters and normalizes segmented terms.
//
// For every string in `*input_ptr`, in order:
//   - blank / whitespace-only terms are dropped;
//   - terms found in `stop_words` are dropped;
//   - terms made only of ASCII letters and digits are lowercased IN PLACE, and dropped if the
//     lowercased form is a stop word (so "The" is filtered exactly like "the");
//   - every surviving term is appended to `*output_ptr` as a view.
//
// `*output_ptr` is cleared first. Its string_views point into the strings owned by
// `*input_ptr`, so they are only valid while that vector is neither modified nor destroyed.
void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words,
                               std::vector<std::string>* input_ptr,
                               std::vector<std::string_view>* output_ptr) {
    auto& input = *input_ptr;
    auto& output = *output_ptr;
    output.clear();
    output.reserve(input.size());

    const auto is_stop_word = [&stop_words](const std::string& word) {
        return stop_words.find(word) != stop_words.end();
    };

    for (auto& term : input) {
        if (StringUtils::IsNullOrWhitespaceOnly(term)) {
            continue;
        }
        // remove stop words (exact match on the term as segmented)
        if (is_stop_word(term)) {
            continue;
        }

        // Only pure ASCII alphanumeric terms are case-folded; bytes of multi-byte characters
        // are never touched.
        const bool is_alphanumeric = std::all_of(term.begin(), term.end(), [](char ch) {
            return std::isalnum(static_cast<unsigned char>(ch)) != 0;
        });
        if (is_alphanumeric) {
            bool case_changed = false;
            for (char& ch : term) {
                const char lowered =
                    static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
                case_changed = case_changed || lowered != ch;
                ch = lowered;
            }
            // The lookup above ran on the original casing. Repeat it on the folded form so
            // stop-word filtering does not depend on capitalization.
            if (case_changed && is_stop_word(term)) {
                continue;
            }
        }
        output.emplace_back(term.data(), term.length());
    }
}

// Rewinds the tokenizer: runs the base-class reset, then re-reads and re-tokenizes the whole
// input via InnerReset() so that incrementToken() starts again from the first term.
// NOTE(review): this re-reads `input`; whether the reader itself is rewound depends on
// Lucene::Tokenizer::reset() — confirm against Lucene++.
void JiebaTokenizer::reset() {
Lucene::Tokenizer::reset();
InnerReset();
}

// Re-targets the tokenizer at a new reader and tokenizes its content.
// The base-class call must come first: InnerReset() reads from the `input` member, which
// Lucene::Tokenizer::reset(input) presumably replaces with the new reader — confirm against
// Lucene++.
void JiebaTokenizer::reset(const Lucene::ReaderPtr& input) {
Lucene::Tokenizer::reset(input);
InnerReset();
}

void JiebaTokenizer::InnerReset() {
terms_.clear();
normalized_terms_.clear();
term_index_ = 0;

// read wchar from input
Lucene::String wstr;
wstr.reserve(context_.buffer_size);
while (true) {
int32_t length = input->read(buffer_, /*offset=*/0, context_.buffer_size);
if (length <= 0) {
break;
}
wstr.append(buffer_, length);
}

// jieba tokenize
std::string doc_str = LuceneUtils::WstringToString(wstr);
// TODO(xinyu.lxy): support porter2 stemmer
CutWithMode(context_.tokenize_mode, context_.jieba.get(), doc_str, &terms_);
Normalize(context_.jieba->extractor.GetStopWords(), &terms_, &normalized_terms_);
}

} // namespace paimon::lucene
89 changes: 89 additions & 0 deletions src/paimon/global_index/lucene/jieba_analyzer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "cppjieba/Jieba.hpp"
#include "lucene++/LuceneHeaders.h"
#include "lucene++/MiscUtils.h"
#include "lucene++/PositionIncrementAttribute.h"
#include "lucene++/TermAttribute.h"
#include "paimon/global_index/lucene/lucene_utils.h"
#include "paimon/memory/memory_pool.h"
namespace paimon::lucene {
// Configuration shared by JiebaAnalyzer and every JiebaTokenizer it creates.
// The struct is copied by value into each analyzer/tokenizer; the pool and the jieba instance
// are held through shared_ptr, so copies refer to the same underlying objects.
struct JiebaTokenizerContext {
// `_buffer_size` is counted in wchar_t elements and defaults to kReadBufferSize.
JiebaTokenizerContext(const std::string& _tokenize_mode, bool _with_position,
const std::shared_ptr<cppjieba::Jieba>& _jieba,
const std::shared_ptr<MemoryPool>& _pool,
int32_t _buffer_size = kReadBufferSize);

// Pool used to allocate and free the tokenizer's read buffer.
std::shared_ptr<MemoryPool> pool;
// Segmentation mode; JiebaTokenizer::CutWithMode accepts "mp", "hmm", "mix", "full", "query".
std::string tokenize_mode;
// When true each emitted term gets position increment 1, otherwise 0.
bool with_position;
// Capacity of the read buffer, in wchar_t elements.
int32_t buffer_size;
// Segmenter used to cut the text; also the source of the stop-word set.
std::shared_ptr<cppjieba::Jieba> jieba;

// Default read-buffer capacity: 5 * 1024 * 1024 wchar_t elements.
static inline const int32_t kReadBufferSize = 5 * 1024 * 1024;
// Passed as `max_word_len` to cppjieba's CutSmall in "mp" mode.
static inline const int32_t kMaxWordLen = 1024;
};

class JiebaTokenizer : public Lucene::Tokenizer {
public:
JiebaTokenizer(const JiebaTokenizerContext& context, const Lucene::ReaderPtr& input);

~JiebaTokenizer() override;

bool incrementToken() override;

void reset(const Lucene::ReaderPtr& input) override;

void reset() override;

static void CutWithMode(const std::string& tokenize_mode, const cppjieba::Jieba* jieba,
const std::string& str, std::vector<std::string>* terms_ptr);

// In-place converts each string in `input` to lowercase to avoid data copying.
static void Normalize(const std::unordered_set<std::string>& stop_words,
std::vector<std::string>* input, std::vector<std::string_view>* output);

private:
void InnerReset();

private:
JiebaTokenizerContext context_;
size_t term_index_ = 0;
std::vector<std::string> terms_;
std::vector<std::string_view> normalized_terms_;
wchar_t* buffer_;
Lucene::TermAttributePtr term_att_;
Lucene::PositionIncrementAttributePtr pos_att_;
};

// Lucene analyzer whose token streams are JiebaTokenizer instances.
// It keeps its own copy of the tokenizer context and passes it to every tokenizer it creates.
class JiebaAnalyzer : public Lucene::Analyzer {
 public:
    explicit JiebaAnalyzer(const JiebaTokenizerContext& context) : context_(context) {}

    ~JiebaAnalyzer() override = default;

    // Creates a fresh JiebaTokenizer over `reader`. `field_name` is not consulted: every
    // field is tokenized with the same context.
    Lucene::TokenStreamPtr tokenStream(const Lucene::String& field_name,
                                       const Lucene::ReaderPtr& reader) override {
        Lucene::TokenStreamPtr stream = Lucene::newLucene<JiebaTokenizer>(context_, reader);
        return stream;
    }

 private:
    JiebaTokenizerContext context_;
};
} // namespace paimon::lucene
115 changes: 115 additions & 0 deletions src/paimon/global_index/lucene/jieba_analyzer_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "paimon/global_index/lucene/jieba_analyzer.h"

#include "cppjieba/Jieba.hpp"
#include "gtest/gtest.h"
#include "lucene++/LuceneHeaders.h"
#include "paimon/global_index/lucene/lucene_utils.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/testing/utils/testharness.h"
namespace paimon::lucene::test {
// Fixture parameterized on the tokenizer read-buffer size (in wide characters), so every test
// runs with several buffer capacities.
class JiebaAnalyzerTest : public ::testing::Test, public ::testing::WithParamInterface<int32_t> {
 public:
    void SetUp() override {}
    void TearDown() override {}

    // Token stream over a fixed short Chinese sentence.
    Lucene::TokenStreamPtr CreateJiebaTokenizer(bool with_position) const {
        const Lucene::String default_text = L"我爱机器学习";
        return CreateJiebaTokenizer(with_position, default_text);
    }

    // Token stream over `text`, produced by a JiebaAnalyzer in "query" mode that uses the
    // bundled jieba dictionaries and the parameterized buffer size.
    Lucene::TokenStreamPtr CreateJiebaTokenizer(bool with_position,
                                                const Lucene::String& text) const {
        auto pool = GetDefaultPool();

        const std::string dict_dir = LuceneUtils::GetJiebaDictionaryDir().value();
        const auto dict_file = [&dict_dir](const char* file_name) {
            return dict_dir + "/" + file_name;
        };
        auto jieba = std::make_shared<cppjieba::Jieba>(
            dict_file("jieba.dict.utf8"), dict_file("hmm_model.utf8"),
            dict_file("user.dict.utf8"), dict_file("idf.utf8"), dict_file("stop_words.utf8"));

        auto reader = Lucene::newLucene<Lucene::StringReader>(text);
        const int32_t buffer_size = GetParam();
        JiebaTokenizerContext context(/*tokenize_mode=*/"query", with_position, jieba, pool,
                                      buffer_size);

        auto analyzer = Lucene::newLucene<JiebaAnalyzer>(context);
        return analyzer->tokenStream(/*field_name*/ L"f0", reader);
    }
};

// The default sentence must be segmented into the expected terms (the stop word is removed).
TEST_P(JiebaAnalyzerTest, TestSimple) {
    auto stream = CreateJiebaTokenizer(/*with_position=*/false);
    auto term_att = stream->addAttribute<Lucene::TermAttribute>();

    stream->reset();
    std::vector<Lucene::String> tokens;
    for (bool has_token = stream->incrementToken(); has_token;
         has_token = stream->incrementToken()) {
        tokens.push_back(term_att->term());
    }
    stream->end();
    stream->close();

    const std::vector<Lucene::String> expected = {L"爱", L"机器", L"学习"};
    ASSERT_EQ(expected, tokens);
}

// With positions enabled every term advances the position by one, giving 1, 2, 3.
TEST_P(JiebaAnalyzerTest, TestWithPosition) {
    auto stream = CreateJiebaTokenizer(/*with_position=*/true);
    auto term_att = stream->addAttribute<Lucene::TermAttribute>();
    auto pos_att = stream->addAttribute<Lucene::PositionIncrementAttribute>();

    stream->reset();
    std::vector<Lucene::String> tokens;
    std::vector<int32_t> positions;
    int32_t current_pos = 0;
    for (bool has_token = stream->incrementToken(); has_token;
         has_token = stream->incrementToken()) {
        // Accumulate increments into an absolute position before recording the term.
        current_pos += pos_att->getPositionIncrement();
        positions.push_back(current_pos);
        tokens.push_back(term_att->term());
    }
    stream->end();
    stream->close();

    const std::vector<Lucene::String> expected = {L"爱", L"机器", L"学习"};
    const std::vector<int32_t> expected_pos = {1, 2, 3};
    ASSERT_EQ(expected, tokens);
    ASSERT_EQ(expected_pos, positions);
}

// Mixed Chinese/English text: punctuation, whitespace and stop words are dropped and ASCII
// alphanumeric terms come out lowercased.
TEST_P(JiebaAnalyzerTest, TestNormalize) {
    const Lucene::String text =
        L"由于购买了Iphone14,我越来越热爱网上学习了!Happy work, happy day! \n\t";
    auto stream = CreateJiebaTokenizer(/*with_position=*/false, text);
    auto term_att = stream->addAttribute<Lucene::TermAttribute>();

    stream->reset();
    std::vector<Lucene::String> tokens;
    for (bool has_token = stream->incrementToken(); has_token;
         has_token = stream->incrementToken()) {
        tokens.push_back(term_att->term());
    }
    stream->end();
    stream->close();

    const std::vector<Lucene::String> expected = {L"购买", L"iphone14", L"越来", L"越来越",
                                                  L"热爱", L"网上",     L"学习", L"happy",
                                                  L"work", L"happy",    L"day"};
    ASSERT_EQ(expected, tokens);
}

// Run every test with several read-buffer capacities, including ones smaller than the input
// text, so the chunked read loop is exercised.
INSTANTIATE_TEST_SUITE_P(ReadBufferSize, JiebaAnalyzerTest, ::testing::Values(2, 5, 10, 100));

} // namespace paimon::lucene::test
Loading