Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions python/infinity_sdk/infinity/rag_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,11 +570,16 @@ def naive_qie(txt):
parser.add_argument('--fine-grained', action='store_true',
help='Use fine-grained tokenization')
parser.add_argument('--user-dict', help='User dictionary file')
parser.add_argument('-l', '--language', help='Language for stemming (e.g., english, dutch)')

args = parser.parse_args()

tokenizer = RagTokenizer(debug=True, user_dict=args.user_dict)

# Set language if specified
if args.language:
tokenizer.set_language(args.language)

# Process input
if args.file:
# File mode
Expand Down
7 changes: 6 additions & 1 deletion src/common/analyzer/rag_analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import :darts_trie;
import :stemmer;
import :analyzer;
import :wordnet_lemmatizer;
import :logger;

import third_party;

Expand All @@ -41,7 +42,9 @@ public:

~RAGAnalyzer();

void InitStemmer(Language language) { stemmer_->Init(language); }
void InitStemmer(Language language);

void SetLanguage(const std::string &language);

Status Load();

Expand Down Expand Up @@ -132,6 +135,8 @@ public:

WordNetLemmatizer *wordnet_lemma_{nullptr};

bool use_lemmatizer_{true}; // WordNet only supports English

std::unique_ptr<Stemmer> stemmer_;

OpenCC *opencc_{nullptr};
Expand Down
92 changes: 84 additions & 8 deletions src/common/analyzer/rag_analyzer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,28 @@ static const std::string WORDNET_PATH = "wordnet";

static const std::string OPENCC_PATH = "opencc";

// Map language names (lowercase) to Stemmer Language enum.
// Used by SetLanguage() to configure language-specific stemming.
// Map language names (lowercase) to Stemmer Language enum.
// Used by SetLanguage() to configure language-specific stemming.
// Keys must already be lowercase/trimmed (SetLanguage normalizes input before
// the lookup). The table is tiny, so a linear scan at lookup time is fine.
static const std::pair<std::string, Language> SNOWBALL_LANGUAGE_MAP[] = {
    {"english", STEM_LANG_ENGLISH},
    {"dutch", STEM_LANG_DUTCH},
    {"german", STEM_LANG_GERMAN},
    {"french", STEM_LANG_FRENCH},
    {"spanish", STEM_LANG_SPANISH},
    {"italian", STEM_LANG_ITALIAN},
    {"portuguese", STEM_LANG_PORTUGUESE},
    {"portuguese br", STEM_LANG_PORTUGUESE}, // Brazilian Portuguese shares the Portuguese stemmer
    {"russian", STEM_LANG_RUSSIAN},
    {"arabic", STEM_LANG_UNKNOWN}, // No Arabic entry in Language enum (stemmer.cppm), so use UNKNOWN to keep default stemming behavior
    {"danish", STEM_LANG_DANISH},
    {"finnish", STEM_LANG_FINNISH},
    {"hungarian", STEM_LANG_HUNGARIAN},
    {"norwegian", STEM_LANG_NORWEGIAN},
    {"romanian", STEM_LANG_ROMANIAN},
    {"swedish", STEM_LANG_SWEDISH},
    {"turkish", STEM_LANG_TURKISH},
};
Comment thread
qinling0210 marked this conversation as resolved.

static const std::string REGEX_SPLIT_CHAR =
R"#(([ ,\.<>/?;'\[\]\`!@#$%^&*$$\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-zA-Z\.-]+|[0-9,\.-]+))#";

Expand Down Expand Up @@ -662,6 +684,40 @@ RAGAnalyzer::~RAGAnalyzer() {
}
}

// Initialize (or re-initialize) the Snowball stemmer for `language` and
// enable the WordNet lemmatizer only for English (WordNet is English-only).
void RAGAnalyzer::InitStemmer(Language language) {
    // Stemmer::Init unconditionally allocates a fresh StemFunc + SN_env
    // (see stemmer_impl.cpp), so re-initializing without releasing the
    // previous state leaks it. DeInit first to make repeated calls safe.
    // NOTE(review): assumes Stemmer::DeInit is a safe no-op before the
    // first Init — confirm against stemmer_impl.cpp.
    stemmer_->DeInit();
    stemmer_->Init(language);
    use_lemmatizer_ = (language == STEM_LANG_ENGLISH);
}

// Configure language-specific stemming from a user-supplied language name
// (case-insensitive, surrounding whitespace ignored). Languages without a
// Snowball mapping keep the current stemmer/lemmatizer configuration.
void RAGAnalyzer::SetLanguage(const std::string &language) {
    std::string lang_key = language;
    // Convert to lowercase
    std::transform(lang_key.begin(), lang_key.end(), lang_key.begin(), [](unsigned char c) { return std::tolower(c); });
    // Trim whitespace
    lang_key.erase(lang_key.find_last_not_of(" \t") + 1);
    lang_key.erase(0, lang_key.find_first_not_of(" \t"));

    // Linear scan is fine: SNOWBALL_LANGUAGE_MAP is a small fixed table.
    Language stem_lang = STEM_LANG_UNKNOWN;
    std::string snowball_lang;
    for (const auto &pair : SNOWBALL_LANGUAGE_MAP) {
        if (pair.first == lang_key) {
            stem_lang = pair.second;
            snowball_lang = pair.first;
            break;
        }
    }

    if (stem_lang != STEM_LANG_UNKNOWN) {
        // Release the previous stemmer state before re-initializing:
        // Stemmer::Init allocates a new StemFunc + SN_env on every call
        // (see stemmer_impl.cpp), and the constructor already initialized
        // the English stemmer, so Init without DeInit leaks the old one.
        stemmer_->DeInit();
        stemmer_->Init(stem_lang);
        // WordNet lemmatization only supports English.
        use_lemmatizer_ = (stem_lang == STEM_LANG_ENGLISH);
        LOG_DEBUG(fmt::format("Tokenizer language set to '{}' (Snowball: {}, lemmatizer: {})", language, snowball_lang, use_lemmatizer_));
    } else {
        // Unsupported language (Chinese, Japanese, Korean, etc.) –
        // keep defaults. CJK text uses dictionary segmentation, not stemming.
        LOG_DEBUG(fmt::format("Language '{}' has no Snowball stemmer; keeping defaults", language));
    }
}
Comment on lines +687 to +719
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Memory leak: re-Init without DeInit leaks the prior StemFunc and SN_env.

Stemmer::Init unconditionally does stem_function_ = static_cast<void *>(new StemFunc); ... ->env = ...->create(); (see stemmer_impl.cpp lines 87–113). Because the constructor already calls InitStemmer(STEM_LANG_ENGLISH), every subsequent SetLanguage(...) (e.g., the new Dutch/Arabic tests) leaks the previous StemFunc heap allocation and its associated SN_env. Call DeInit first (or fix Stemmer::Init to deinit before re-init).

🛡️ Proposed fix
 void RAGAnalyzer::InitStemmer(Language language) {
+    stemmer_->DeInit();
     stemmer_->Init(language);
     use_lemmatizer_ = (language == STEM_LANG_ENGLISH);
 }
@@
     if (stem_lang != STEM_LANG_UNKNOWN) {
+        stemmer_->DeInit();
         stemmer_->Init(stem_lang);
         use_lemmatizer_ = (stem_lang == STEM_LANG_ENGLISH);

Alternatively, fix the root cause inside Stemmer::Init by invoking DeInit() at the start before allocating a new StemFunc.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
void RAGAnalyzer::InitStemmer(Language language) {
stemmer_->Init(language);
use_lemmatizer_ = (language == STEM_LANG_ENGLISH);
}
void RAGAnalyzer::SetLanguage(const std::string &language) {
std::string lang_key = language;
// Convert to lowercase
std::transform(lang_key.begin(), lang_key.end(), lang_key.begin(), [](unsigned char c) { return std::tolower(c); });
// Trim whitespace
lang_key.erase(lang_key.find_last_not_of(" \t") + 1);
lang_key.erase(0, lang_key.find_first_not_of(" \t"));
Language stem_lang = STEM_LANG_UNKNOWN;
std::string snowball_lang;
for (const auto &pair : SNOWBALL_LANGUAGE_MAP) {
if (pair.first == lang_key) {
stem_lang = pair.second;
snowball_lang = pair.first;
break;
}
}
if (stem_lang != STEM_LANG_UNKNOWN) {
stemmer_->Init(stem_lang);
use_lemmatizer_ = (stem_lang == STEM_LANG_ENGLISH);
LOG_DEBUG(fmt::format("Tokenizer language set to '{}' (Snowball: {}, lemmatizer: {})", language, snowball_lang, use_lemmatizer_));
} else {
// Unsupported language (Chinese, Japanese, Korean, etc.) –
// keep defaults. CJK text uses dictionary segmentation, not stemming.
LOG_DEBUG(fmt::format("Language '{}' has no Snowball stemmer; keeping defaults", language));
}
}
void RAGAnalyzer::InitStemmer(Language language) {
stemmer_->DeInit();
stemmer_->Init(language);
use_lemmatizer_ = (language == STEM_LANG_ENGLISH);
}
void RAGAnalyzer::SetLanguage(const std::string &language) {
std::string lang_key = language;
// Convert to lowercase
std::transform(lang_key.begin(), lang_key.end(), lang_key.begin(), [](unsigned char c) { return std::tolower(c); });
// Trim whitespace
lang_key.erase(lang_key.find_last_not_of(" \t") + 1);
lang_key.erase(0, lang_key.find_first_not_of(" \t"));
Language stem_lang = STEM_LANG_UNKNOWN;
std::string snowball_lang;
for (const auto &pair : SNOWBALL_LANGUAGE_MAP) {
if (pair.first == lang_key) {
stem_lang = pair.second;
snowball_lang = pair.first;
break;
}
}
if (stem_lang != STEM_LANG_UNKNOWN) {
stemmer_->DeInit();
stemmer_->Init(stem_lang);
use_lemmatizer_ = (stem_lang == STEM_LANG_ENGLISH);
LOG_DEBUG(fmt::format("Tokenizer language set to '{}' (Snowball: {}, lemmatizer: {})", language, snowball_lang, use_lemmatizer_));
} else {
// Unsupported language (Chinese, Japanese, Korean, etc.) –
// keep defaults. CJK text uses dictionary segmentation, not stemming.
LOG_DEBUG(fmt::format("Language '{}' has no Snowball stemmer; keeping defaults", language));
}
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/common/analyzer/rag_analyzer_impl.cpp` around lines 687 - 719,
SetLanguage/InitStemmer call stemmer_->Init repeatedly which leaks the previous
StemFunc/SN_env; fix by ensuring the previous state is deinitialized before
re-initializing: either call stemmer_->DeInit() at the start of
RAGAnalyzer::SetLanguage (and/or RAGAnalyzer::InitStemmer) before calling
stemmer_->Init(stem_lang) or modify Stemmer::Init to invoke DeInit() at its top
so Init is safe to call multiple times; update references in
RAGAnalyzer::SetLanguage, RAGAnalyzer::InitStemmer and Stemmer::Init/DeInit
accordingly.


Status RAGAnalyzer::Load() {
fs::path root(dict_path_);
fs::path dict_path(root / DICT_PATH);
Expand Down Expand Up @@ -1332,9 +1388,14 @@ void RAGAnalyzer::EnglishNormalize(const std::vector<std::string> &tokens, std::
// Apply lowercase before lemmatization to match Python NLTK behavior
char *lowercase_term = lowercase_string_buffer_.data();
ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_);
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
std::string term_to_stem;
if (use_lemmatizer_) {
term_to_stem = wordnet_lemma_->Lemmatize(lowercase_term);
} else {
term_to_stem = lowercase_term;
}
std::string stem_term;
stemmer_->Stem(lemma_term, stem_term);
stemmer_->Stem(term_to_stem, stem_term);
res.push_back(stem_term);
} else {
res.push_back(t);
Expand Down Expand Up @@ -1694,9 +1755,14 @@ std::string RAGAnalyzer::Tokenize(const std::string &line) {
// Apply lowercase before lemmatization to match Python NLTK behavior
char *lowercase_term = lowercase_string_buffer_.data();
ToLower(term_list[i].c_str(), term_list[i].size(), lowercase_term, term_string_buffer_limit_);
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
std::string term_to_stem;
if (use_lemmatizer_) {
term_to_stem = wordnet_lemma_->Lemmatize(lowercase_term);
} else {
term_to_stem = lowercase_term;
}
std::string stem_term;
stemmer_->Stem(lemma_term, stem_term);
stemmer_->Stem(term_to_stem, stem_term);
res.push_back(stem_term);
}
continue;
Expand Down Expand Up @@ -1811,9 +1877,14 @@ std::pair<std::vector<std::string>, std::vector<std::pair<unsigned, unsigned>>>
// Apply lowercase before lemmatization to match Python NLTK behavior
char *lowercase_term = lowercase_string_buffer_.data();
ToLower(term.c_str(), term.size(), lowercase_term, term_string_buffer_limit_);
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
std::string term_to_stem;
if (use_lemmatizer_) {
term_to_stem = wordnet_lemma_->Lemmatize(lowercase_term);
} else {
term_to_stem = lowercase_term;
}
std::string stem_term;
stemmer_->Stem(lemma_term, stem_term);
stemmer_->Stem(term_to_stem, stem_term);

tokens.push_back(stem_term);

Expand Down Expand Up @@ -2136,9 +2207,14 @@ void RAGAnalyzer::EnglishNormalizeWithPosition(const std::vector<std::string> &t
// Apply lowercase before lemmatization to match Python NLTK behavior
char *lowercase_term = lowercase_string_buffer_.data();
ToLower(token.c_str(), token.size(), lowercase_term, term_string_buffer_limit_);
std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
std::string term_to_stem;
if (use_lemmatizer_) {
term_to_stem = wordnet_lemma_->Lemmatize(lowercase_term);
} else {
term_to_stem = lowercase_term;
}
std::string stem_term;
stemmer_->Stem(lemma_term, stem_term);
stemmer_->Stem(term_to_stem, stem_term);

normalize_tokens.push_back(stem_term);
normalize_positions.emplace_back(start_pos, end_pos);
Expand Down
30 changes: 30 additions & 0 deletions src/unit_test/common/analyzer/rag_analyzer_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,33 @@ TEST_F(RAGAnalyzerTest, test_fine_grained_tokenize_consistency_with_python) {
}
infile.close();
}

TEST_F(RAGAnalyzerTest, test_set_language_dutch) {
    if (!analyzer_) {
        FAIL() << "RAGAnalyzer not loaded, skipping test";
    }
    // Dutch word "huizen" (houses) should stem to "huiz" with Dutch stemmer.
    // Cross-check the C++ tokenizer output against the Python reference impl.
    const std::string python_cmd = "uv run " + rag_tokenizer_path_ + "/rag_tokenizer.py " + "-l dutch \"huizen\"";
    std::cout << "Call Python tokenizer: " << python_cmd << std::endl;

    // Capture the Python tokenizer's stdout via a pipe.
    std::string python_result;
    if (FILE *proc = popen(python_cmd.c_str(), "r"); proc != nullptr) {
        char chunk[128];
        while (fgets(chunk, sizeof(chunk), proc) != nullptr) {
            python_result.append(chunk);
        }
        pclose(proc);
    }
    // Strip trailing whitespace/newlines from the captured output.
    python_result.erase(python_result.find_last_not_of(" \n\r\t") + 1);
    std::cout << "Python 'huizen' tokenized (Dutch): " << python_result << std::endl;

    analyzer_->SetLanguage("dutch");
    const std::string cxx_result = analyzer_->Tokenize("huizen");
    std::cout << "C++ 'huizen' tokenized (Dutch): " << cxx_result << std::endl;

    EXPECT_TRUE(cxx_result.find("huiz") != std::string::npos);
    EXPECT_EQ(cxx_result, python_result);
}
Loading