Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/orama/src/components/tokenizer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import type { Optional } from '../../types.js'
import { createError } from '../../errors.js'
import { Stemmer, Tokenizer, DefaultTokenizerConfig } from '../../types.js'
import { replaceDiacritics } from './diacritics.js'
import { Language, SPLITTERS, SUPPORTED_LANGUAGES } from './languages.js'
import { Language, SPLITTERS, SUPPORTED_LANGUAGES, LANGUAGES_WITH_SIGNIFICANT_DIACRITICS } from './languages.js'
import { stemmer as english } from './english-stemmer.js'

export interface DefaultTokenizer extends Tokenizer {
Expand Down Expand Up @@ -36,7 +36,9 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: stri
token = this.stemmer(token)
}

token = replaceDiacritics(token)
if (!LANGUAGES_WITH_SIGNIFICANT_DIACRITICS.has(this.language)) {
token = replaceDiacritics(token)
}
if (withCache) {
this.normalizationCache.set(key, token)
}
Expand Down
7 changes: 7 additions & 0 deletions packages/orama/src/components/tokenizer/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export const STEMMERS: Record<string, string> = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
vietnamese: 'vi',
sanskrit: 'sk'
}

Expand Down Expand Up @@ -61,6 +62,7 @@ export const SPLITTERS: Record<Language, RegExp> = {
bulgarian: /[^a-z0-9а-яА-Я]+/gim,
tamil: /[^a-z0-9அ-ஹ]+/gim,
sanskrit: /[^a-z0-9A-Zāīūṛḷṃṁḥśṣṭḍṇṅñḻḹṝ]+/gim,
vietnamese: /[^a-z0-9A-ZáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴĐ_]+/gim,
czech: /[^A-Z0-9a-zěščřžýáíéúůóťďĚŠČŘŽÝÁÍÉÓÚŮŤĎ-]+/gim
}

Expand All @@ -71,3 +73,8 @@ export function getLocale(language: string | undefined) {
}

export type Language = (typeof SUPPORTED_LANGUAGES)[number]

/**
 * Languages in which diacritics carry meaning (for example the tone marks of
 * Vietnamese) and therefore must be preserved on tokens.
 *
 * Folding such tokens to plain ASCII would merge unrelated words into one
 * (e.g. "tài" would become "tai"), which degrades search quality.
 */
export const LANGUAGES_WITH_SIGNIFICANT_DIACRITICS: Set<Language> = new Set(['vietnamese'])
19 changes: 19 additions & 0 deletions packages/orama/tests/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { stemmer as spanishStemmer, language as spanishLanguage } from '@orama/s
import { stemmer as swedishStemmer, language as swedishLanguage } from '@orama/stemmers/swedish'
import { stemmer as ukrainianStemmer, language as ukrainianLanguage } from '@orama/stemmers/ukrainian'
import { stemmer as tamilStemmer, language as tamilLanguage } from '@orama/stemmers/tamil'
import { stemmer as vietnameseStemmer, language as vietnameseLanguage } from '@orama/stemmers/vietnamese'

import { stopwords as danishStopwords } from '@orama/stopwords/danish'
import { stopwords as dutchStopwords } from '@orama/stopwords/dutch'
Expand All @@ -29,6 +30,7 @@ import { stopwords as spanishStopwords } from '@orama/stopwords/spanish'
import { stopwords as swedishStopwords } from '@orama/stopwords/swedish'
import { stopwords as ukrainianStopwords } from '@orama/stopwords/ukrainian'
import { stopwords as tamilStopwords } from '@orama/stopwords/tamil'
import { stopwords as vietnameseStopwords } from '@orama/stopwords/vietnamese'

import { createTokenizer } from '../src/components/tokenizer/index.js'

Expand Down Expand Up @@ -336,6 +338,23 @@ t.test('Tokenizer', async (t) => {
t.strictSame(O2, ['я', 'приготувал', 'тістечк'])
})

t.test('should tokenize and stem correctly in vietnamese', async (t) => {
  // Build a tokenizer wired with the Vietnamese language, stemmer and stop words.
  const vietnameseTokenizer = await createTokenizer({
    language: vietnameseLanguage,
    stemmer: vietnameseStemmer,
    stopWords: vietnameseStopwords
  })

  const sentences = ['Tìm kiếm tài liệu trong thư viện', 'Học lập trình là một việc thú vị']

  // Tokenize every sentence first, then verify the results in order.
  const [firstTokens, secondTokens] = sentences.map((sentence) => vietnameseTokenizer.tokenize(sentence))

  // Expected tokens are lowercased, keep their tone marks, and omit the
  // stop words present in the inputs ("trong", "là", "một", "việc").
  t.strictSame(firstTokens, ['tìm', 'kiếm', 'tài', 'liệu', 'thư', 'viện'])
  t.strictSame(secondTokens, ['học', 'lập', 'trình', 'thú', 'vị'])
})

t.test('should tokenize and stem correctly in bulgarian', async (t) => {
const tokenizer = await createTokenizer({ language: bulgarianLanguage, stemmer: bulgarianStemmer, stopWords: [] })

Expand Down
8 changes: 8 additions & 0 deletions packages/stemmers/lib/vi.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/**
 * Vietnamese stemmer.
 *
 * Vietnamese is an isolating (analytic) language where words do not inflect,
 * so no morphological stemming is needed: this is an identity function that
 * exists so Vietnamese exposes the same `stemmer` export as other languages.
 *
 * @param word - The token to stem.
 * @returns The same `word`, unchanged.
 */

export function stemmer(word) {
return word
}
5 changes: 5 additions & 0 deletions packages/stemmers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@
"import": "./dist/uk.js",
"require": "./dist/uk.cjs"
},
"./vietnamese": {
"types": "./dist/vi.d.ts",
"import": "./dist/vi.js",
"require": "./dist/vi.cjs"
},
"./sanskrit": {
"types": "./dist/sk.d.ts",
"import": "./dist/sk.js",
Expand Down
1 change: 1 addition & 0 deletions packages/stemmers/scripts/build.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const stemmers = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
vietnamese: 'vi',
sanskrit: 'sk'
}

Expand Down
85 changes: 85 additions & 0 deletions packages/stopwords/lib/vi.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
 * Vietnamese stop words: very common function words (particles, prepositions,
 * conjunctions, auxiliaries) that carry little meaning for search.
 *
 * Entries are lowercase and keep their diacritics. Only single-syllable
 * function words are listed.
 *
 * Note: the bare form "da" is intentionally NOT listed. Without diacritics it
 * is a content noun ("skin" / "leather"), so filtering it would drop a
 * legitimate search term. The function word "đã" is listed on its own.
 */
export const stopwords = [
  'bị',
  'bởi',
  'cả',
  'các',
  'cái',
  'cần',
  'càng',
  'chẳng',
  'chỉ',
  'cho',
  'chưa',
  'chừng',
  'có',
  'cùng',
  'cũng',
  'của',
  'dù',
  'dưới',
  'đã',
  'đang',
  'đây',
  'để',
  'đến',
  'đều',
  'điều',
  'do',
  'đó',
  'được',
  'gì',
  'giữa',
  'hầu',
  'hết',
  'hiện',
  'hoặc',
  'hơn',
  'hơi',
  'khi',
  'không',
  'là',
  'lại',
  'lên',
  'lúc',
  'mà',
  'mỗi',
  'một',
  'này',
  'nên',
  'nếu',
  'ngay',
  'nhiều',
  'như',
  'nhưng',
  'những',
  'nơi',
  'nữa',
  'ở',
  'phải',
  'qua',
  'ra',
  'rất',
  'rằng',
  'rồi',
  'sau',
  'sẽ',
  'so',
  'sự',
  'tại',
  'theo',
  'thì',
  'trong',
  'trên',
  'trước',
  'từ',
  'từng',
  'và',
  'vẫn',
  'vào',
  'vậy',
  'vì',
  'việc',
  'với',
  'vừa',
]
5 changes: 5 additions & 0 deletions packages/stopwords/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@
"import": "./dist/uk.js",
"require": "./dist/uk.cjs"
},
"./vietnamese": {
"types": "./dist/vi.d.ts",
"import": "./dist/vi.js",
"require": "./dist/vi.cjs"
},
"./sanskrit": {
"types": "./dist/sk.d.ts",
"import": "./dist/sk.js",
Expand Down
1 change: 1 addition & 0 deletions packages/stopwords/scripts/build.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ const stemmers = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
vietnamese: 'vi',
sanskrit: 'sk'
}

Expand Down
Loading