apache · nishant94 · Jun 19, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.clang-format-ignore b/.clang-format-ignore
@@ -9,4 +9,5 @@ be/src/util/sse2neon.h
 be/src/util/mustache/mustache.h
 be/src/util/mustache/mustache.cc
 be/src/util/utf8_check.cpp
+be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h
 cloud/src/common/defer.h
diff --git a/.gitignore b/.gitignore
@@ -153,3 +153,6 @@ compile_commands.json
 .github
 
 .worktrees/
+
+# generated kuromoji dictionary binaries
+/be/dict/kuromoji/*.bin
diff --git a/.licenserc.yaml b/.licenserc.yaml
@@ -80,6 +80,7 @@ header:
     - "be/src/util/sse2neo.h"
     - "be/src/util/sse2neon.h"
     - "be/src/util/utf8_check.cpp"
+    - "be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h"
     - "be/src/pch/*"
     - "be/test/data"
     - "be/test/expected_result"

diff --git a/NOTICE.txt b/NOTICE.txt
@@ -73,6 +73,9 @@ This software includes third party software subject to the following copyrights:
 - Netty Reactive Streams - https://github.com/playframework/netty-reactive-streams
 - Jackson-core - https://github.com/FasterXML/jackson-core
 - Jackson-dataformat-cbor - https://github.com/FasterXML/jackson-dataformats-binary
+- Darts-clone (double-array trie) - Copyright 2008-2014 Susumu Yata - https://github.com/s-yata/darts-clone (BSD 2-clause; see dist/licenses/LICENSE-darts-clone.txt)
+- mecab-ipadic (IPADIC) Japanese morphological dictionary - Copyright 2000-2003 Nara Institute of Science and Technology (NAIST) - licensed under NAIST-2003 (BSD-style); the kuromoji analyzer bundles the UTF-8 form from https://github.com/lindera/mecab-ipadic (content of mecab-ipadic-2.7.0-20070801). See dist/licenses/LICENSE-ipadic.txt.
+- Apache Lucene - https://github.com/apache/lucene (Apache-2.0): the kuromoji Japanese analyzer under be/src/storage/index/inverted/analyzer/kuromoji is an independent C++ implementation modeled on Lucene's kuromoji analyzer (JapaneseTokenizer), including its search-mode compound-decomposition cost model.
 
 The licenses for these third party components are included in LICENSE.txt
 

@@ -313,6 +313,12 @@ install(DIRECTORY
     ${BASE_DIR}/dict/pinyin
     DESTINATION ${OUTPUT_DIR}/dict)
 
+# Japanese kuromoji dictionary
+install(DIRECTORY
+    ${BASE_DIR}/dict/kuromoji
+    DESTINATION ${OUTPUT_DIR}/dict
+    OPTIONAL)
+
 # Check if functions are supported in this platform. All flags will generated
 # in gensrc/build/common/env_config.h.
 # You can check funcion here which depends on platform. Don't forget add this

diff --git a/be/dict/kuromoji/README.md b/be/dict/kuromoji/README.md
@@ -0,0 +1,36 @@
+# Kuromoji (Japanese) dictionary
+
+This directory holds the compiled IPADIC dictionary consumed at runtime by the
+`kuromoji` inverted-index analyzer (`KuromojiAnalyzer` → `KuromojiDictionary`):
+
+- `system.bin`  — surface→word Darts trie + word entries + feature blob
+- `matrix.bin`  — connection-cost matrix (1316×1316)
+- `chardef.bin` — character-category map + per-category flags
+- `unkdict.bin` — unknown-word entries per category
+
+These `*.bin` files are **generated** (not committed; see `.gitignore`). The
+runtime resolves them at `${inverted_index_dict_path}/kuromoji`
+(default `${DORIS_HOME}/dict/kuromoji`); `be/CMakeLists.txt` installs this
+directory into the BE package.
+
+## How it's (re)generated
+
+Source: the UTF-8 IPADIC from <https://github.com/lindera/mecab-ipadic>
+(tag `2.7.0-20250920`) — the original `mecab-ipadic-2.7.0-20070801` lexicon
+converted to UTF-8 (license: NAIST-2003, see `dist/licenses/LICENSE-ipadic.txt`).
+
+Regeneration is automated and takes two steps:
+
+```bash
+# 1. thirdparty fetches + stages the UTF-8 IPADIC source into
+#    ${DORIS_THIRDPARTY}/installed/share/mecab-ipadic-2.7.0-20250920
+sh thirdparty/build-thirdparty.sh mecab_ipadic
+
+# 2. the CMake target builds the offline compiler and produces the *.bin here
+ninja -C be/ut_build_RELEASE kuromoji_dict
+```
+
+CI/release should run `ninja kuromoji_dict` before packaging; the BE `install`
+rule then ships this directory. Override the source dir with
+`-DKUROMOJI_IPADIC_SRC=<path>` at CMake configure time. (The tool can also be
+run directly: `kuromoji_build_dict <utf8_ipadic_src_dir> be/dict/kuromoji`.)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
@@ -1287,6 +1287,8 @@ DEFINE_mDouble(inverted_index_ram_buffer_size, "512");
 DEFINE_mInt32(inverted_index_max_buffered_docs, "-1");
 // dict path for chinese analyzer
 DEFINE_String(inverted_index_dict_path, "${DORIS_HOME}/dict");
+// Whether the kuromoji (Japanese) analyzer may be created; disabled by default.
+DEFINE_mBool(enable_kuromoji_analyzer, "false");
 DEFINE_Int32(inverted_index_read_buffer_size, "4096");
 // tree depth for bkd index
 DEFINE_Int32(max_depth_in_bkd_tree, "32");

diff --git a/be/src/common/config.h b/be/src/common/config.h
@@ -1329,6 +1329,8 @@ DECLARE_mDouble(inverted_index_ram_buffer_size);
 DECLARE_mInt32(inverted_index_max_buffered_docs);
 // dict path for chinese analyzer
 DECLARE_String(inverted_index_dict_path);
+// Whether the kuromoji (Japanese) analyzer may be created; disabled by default.
+DECLARE_mBool(enable_kuromoji_analyzer);
 DECLARE_Int32(inverted_index_read_buffer_size);
 // tree depth for bkd index
 DECLARE_Int32(max_depth_in_bkd_tree);

@@ -39,6 +39,7 @@
 #include "storage/index/inverted/analyzer/basic/basic_analyzer.h"
 #include "storage/index/inverted/analyzer/icu/icu_analyzer.h"
 #include "storage/index/inverted/analyzer/ik/IKAnalyzer.h"
+#include "storage/index/inverted/analyzer/kuromoji/KuromojiAnalyzer.h"
 #include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h"
 
 namespace doris::segment_v2::inverted_index {
@@ -69,7 +70,8 @@ bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name
            analyzer_name == INVERTED_INDEX_PARSER_CHINESE ||
            analyzer_name == INVERTED_INDEX_PARSER_ICU ||
            analyzer_name == INVERTED_INDEX_PARSER_BASIC ||
-           analyzer_name == INVERTED_INDEX_PARSER_IK;
+           analyzer_name == INVERTED_INDEX_PARSER_IK ||
+           analyzer_name == INVERTED_INDEX_PARSER_KUROMOJI;
 }
 
 AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
@@ -107,6 +109,17 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
             ik_analyzer->setMode(false);
         }
         analyzer = std::move(ik_analyzer);
+    } else if (parser_type == InvertedIndexParserType::PARSER_KUROMOJI) {
+        if (!config::enable_kuromoji_analyzer) {
+            throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
+                            "kuromoji analyzer is disabled by default. Set "
+                            "enable_kuromoji_analyzer=true in "
+                            "be.conf (or via the BE config HTTP API) to enable it.");
+        }
+        auto kuromoji_analyzer = std::make_shared<KuromojiAnalyzer>();
+        kuromoji_analyzer->initDict(config::inverted_index_dict_path + "/kuromoji");
+        kuromoji_analyzer->setMode(kuromoji_mode_from_string(parser_mode));
+        analyzer = std::move(kuromoji_analyzer);
     } else {
         // default
         analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();

@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "common/logging.h"
+#include "storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h"
+#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h"
+
+namespace doris::segment_v2 {
+
+// Analyzer for Japanese text: hands out KuromojiTokenizer streams configured
+// with this analyzer's mode, lowercase flag and dictionary pointer.
+class KuromojiAnalyzer : public Analyzer {
+public:
+    KuromojiAnalyzer() {
+        _ownReader = false;
+        _lowercase = true;
+    }
+    ~KuromojiAnalyzer() override = default;
+
+    bool isSDocOpt() override { return true; }
+
+    // Asks KuromojiDictionary::get_or_load for the dictionary under `dictPath`
+    // and remembers the result. A null result is not an error here: it is only
+    // logged, and tokenizers are then built with a null dictionary (which, per
+    // the warning below, means per-codepoint tokenization).
+    void initDict(const std::string& dictPath) override {
+        dict_ = kuromoji::KuromojiDictionary::get_or_load(dictPath);
+        if (dict_ != nullptr) {
+            return;
+        }
+        LOG(WARNING) << "kuromoji: dictionary unavailable at " << dictPath
+                     << "; falling back to per-codepoint tokenization";
+    }
+
+    // Sets the segmentation mode used for tokenizers built from now on. The
+    // tokenizer cached by reusableTokenStream() is not rebuilt, so call this
+    // before the first reusableTokenStream().
+    void setMode(KuromojiMode mode) { mode_ = mode; }
+
+    // Allocates a new tokenizer with _CLNEW, points it at `reader` and returns
+    // it. `fieldName` is not consulted.
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override {
+        auto* fresh = _CLNEW KuromojiTokenizer(mode_, _lowercase, _ownReader, dict_);
+        fresh->reset(reader);
+        return static_cast<TokenStream*>(fresh);
+    }
+
+    // Returns the analyzer-owned tokenizer, creating it on first use, after
+    // re-pointing it at `reader`. The returned pointer stays owned by this
+    // analyzer. `fieldName` is not consulted.
+    TokenStream* reusableTokenStream(const TCHAR* fieldName,
+                                     lucene::util::Reader* reader) override {
+        if (!tokenizer_) {
+            tokenizer_ = std::make_unique<KuromojiTokenizer>(mode_, _lowercase, _ownReader, dict_);
+        }
+        auto* cached = tokenizer_.get();
+        cached->reset(reader);
+        return static_cast<TokenStream*>(cached);
+    }
+
+private:
+    const kuromoji::KuromojiDictionary* dict_ {nullptr};
+    KuromojiMode mode_ {KuromojiMode::Search};
+    std::unique_ptr<KuromojiTokenizer> tokenizer_;
+};
+
+} // namespace doris::segment_v2
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+namespace doris::segment_v2 {
+
+// Segmentation mode; mirrors Lucene's JapaneseTokenizer.Mode.
+//   Normal   - returns the minimum-cost segmentation.
+//   Search   - additionally decomposes long compounds into their shorter parts
+//              (via a length-based cost penalty) for better search recall.
+//   Extended - Search penalty, plus unknown (OOV) words split into per-character unigrams.
+enum class KuromojiMode { Normal, Search, Extended };
+
+// Maps a mode name to its KuromojiMode. Only the exact strings "normal" and
+// "extended" are recognised; every other input (including the empty string)
+// yields Search, the default (matches OpenSearch/Lucene).
+inline KuromojiMode kuromoji_mode_from_string(const std::string& mode) {
+    KuromojiMode parsed = KuromojiMode::Search;
+    if (mode == "extended") {
+        parsed = KuromojiMode::Extended;
+    } else if (mode == "normal") {
+        parsed = KuromojiMode::Normal;
+    }
+    return parsed;
+}
+
+} // namespace doris::segment_v2