oramasearch · thatjuan · Jun 26, 2026 · Feb 11, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/packages/orama/src/components/tokenizer/index.ts b/packages/orama/src/components/tokenizer/index.ts
@@ -2,7 +2,7 @@ import type { Optional } from '../../types.js'
 import { createError } from '../../errors.js'
 import { Stemmer, Tokenizer, DefaultTokenizerConfig } from '../../types.js'
 import { replaceDiacritics } from './diacritics.js'
-import { Language, SPLITTERS, SUPPORTED_LANGUAGES } from './languages.js'
+import { Language, SPLITTERS, SUPPORTED_LANGUAGES, LANGUAGES_WITH_SIGNIFICANT_DIACRITICS } from './languages.js'
 import { stemmer as english } from './english-stemmer.js'
 
 export interface DefaultTokenizer extends Tokenizer {
@@ -36,7 +36,9 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: stri
     token = this.stemmer(token)
   }
 
-  token = replaceDiacritics(token)
+  if (!LANGUAGES_WITH_SIGNIFICANT_DIACRITICS.has(this.language)) {
+    token = replaceDiacritics(token)
+  }
   if (withCache) {
     this.normalizationCache.set(key, token)
   }

diff --git a/packages/orama/src/components/tokenizer/languages.ts b/packages/orama/src/components/tokenizer/languages.ts
@@ -28,6 +28,7 @@ export const STEMMERS: Record<string, string> = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  vietnamese: 'vi',
   sanskrit: 'sk'
 }
 
@@ -61,6 +62,7 @@ export const SPLITTERS: Record<Language, RegExp> = {
   bulgarian: /[^a-z0-9а-яА-Я]+/gim,
   tamil: /[^a-z0-9அ-ஹ]+/gim,
   sanskrit: /[^a-z0-9A-Zāīūṛḷṃṁḥśṣṭḍṇṅñḻḹṝ]+/gim,
+  vietnamese: /[^a-z0-9A-ZáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴĐ_]+/gim,
   czech: /[^A-Z0-9a-zěščřžýáíéúůóťďĚŠČŘŽÝÁÍÉÓÚŮŤĎ-]+/gim
 }
 
@@ -71,3 +73,8 @@ export function getLocale(language: string | undefined) {
 }
 
 export type Language = (typeof SUPPORTED_LANGUAGES)[number]
+
+// Languages whose diacritics are semantically significant (e.g. Vietnamese tone marks).
+// The tokenizer's normalizeToken skips replaceDiacritics for these languages: folding
+// their tokens would collapse distinct words together (e.g. "tài" -> "tai") and break search.
+export const LANGUAGES_WITH_SIGNIFICANT_DIACRITICS = new Set<Language>(['vietnamese'])
diff --git a/packages/orama/tests/tokenizer.test.ts b/packages/orama/tests/tokenizer.test.ts
@@ -14,6 +14,7 @@ import { stemmer as spanishStemmer, language as spanishLanguage } from '@orama/s
 import { stemmer as swedishStemmer, language as swedishLanguage } from '@orama/stemmers/swedish'
 import { stemmer as ukrainianStemmer, language as ukrainianLanguage } from '@orama/stemmers/ukrainian'
 import { stemmer as tamilStemmer, language as tamilLanguage } from '@orama/stemmers/tamil'
+import { stemmer as vietnameseStemmer, language as vietnameseLanguage } from '@orama/stemmers/vietnamese'
 
 import { stopwords as danishStopwords } from '@orama/stopwords/danish'
 import { stopwords as dutchStopwords } from '@orama/stopwords/dutch'
@@ -29,6 +30,7 @@ import { stopwords as spanishStopwords } from '@orama/stopwords/spanish'
 import { stopwords as swedishStopwords } from '@orama/stopwords/swedish'
 import { stopwords as ukrainianStopwords } from '@orama/stopwords/ukrainian'
 import { stopwords as tamilStopwords } from '@orama/stopwords/tamil'
+import { stopwords as vietnameseStopwords } from '@orama/stopwords/vietnamese'
 
 import { createTokenizer } from '../src/components/tokenizer/index.js'
 
@@ -336,6 +338,23 @@ t.test('Tokenizer', async (t) => {
     t.strictSame(O2, ['я', 'приготувал', 'тістечк'])
   })
 
+  t.test('should tokenize and stem correctly in vietnamese', async (t) => {
+    const vietnameseTokenizer = await createTokenizer({
+      language: vietnameseLanguage,
+      stemmer: vietnameseStemmer,
+      stopWords: vietnameseStopwords
+    })
+    // Expected tokens keep their tone marks; the stop words "trong", "là", "một", "việc" are dropped.
+    const expectations: Array<[string, string[]]> = [
+      ['Tìm kiếm tài liệu trong thư viện', ['tìm', 'kiếm', 'tài', 'liệu', 'thư', 'viện']],
+      ['Học lập trình là một việc thú vị', ['học', 'lập', 'trình', 'thú', 'vị']]
+    ]
+
+    for (const [input, tokens] of expectations) {
+      t.strictSame(vietnameseTokenizer.tokenize(input), tokens)
+    }
+  })
+
   t.test('should tokenize and stem correctly in bulgarian', async (t) => {
     const tokenizer = await createTokenizer({ language: bulgarianLanguage, stemmer: bulgarianStemmer, stopWords: [] })
 

diff --git a/packages/stemmers/lib/vi.js b/packages/stemmers/lib/vi.js
@@ -0,0 +1,8 @@
+/**
+ * No-op stemmer for Vietnamese, an isolating (analytic) language where words do not
+ * inflect: no morphological stemming is needed, so the word is returned as-is.
+ * @param {string} word - token to stem
+ * @returns {string} the same token, unchanged
+ */
+export function stemmer(word) {
+  return word
+}
diff --git a/packages/stemmers/package.json b/packages/stemmers/package.json
@@ -142,6 +142,11 @@
       "import": "./dist/uk.js",
       "require": "./dist/uk.cjs"
     },
+    "./vietnamese": {
+      "types": "./dist/vi.d.ts",
+      "import": "./dist/vi.js",
+      "require": "./dist/vi.cjs"
+    },
     "./sanskrit": {
       "types": "./dist/sk.d.ts",
       "import": "./dist/sk.js",

diff --git a/packages/stemmers/scripts/build.js b/packages/stemmers/scripts/build.js
@@ -40,6 +40,7 @@ const stemmers = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  vietnamese: 'vi',
   sanskrit: 'sk'
 }
 

diff --git a/packages/stopwords/lib/vi.js b/packages/stopwords/lib/vi.js
@@ -0,0 +1,85 @@
+export const stopwords = [
+  'bị',
+  'bởi',
+  'cả',
+  'các',
+  'cái',
+  'cần',
+  'càng',
+  'chẳng',
+  'chỉ',
+  'cho',
+  'chưa',
+  'chừng',
+  'có',
+  'cùng',
+  'cũng',
+  'của',
+  'da', // NOTE(review): unaccented "da" means "skin/leather" (a content word); looks like a mistyped 'đã', which is already listed below — confirm or remove
+  'dù',
+  'dưới',
+  'đã',
+  'đang',
+  'đây',
+  'để',
+  'đến',
+  'đều',
+  'điều',
+  'do',
+  'đó',
+  'được',
+  'gì',
+  'giữa',
+  'hầu',
+  'hết',
+  'hiện',
+  'hoặc',
+  'hơn',
+  'hơi',
+  'khi',
+  'không',
+  'là',
+  'lại',
+  'lên',
+  'lúc',
+  'mà',
+  'mỗi',
+  'một',
+  'này',
+  'nên',
+  'nếu',
+  'ngay',
+  'nhiều',
+  'như',
+  'nhưng',
+  'những',
+  'nơi',
+  'nữa',
+  'ở',
+  'phải',
+  'qua',
+  'ra',
+  'rất',
+  'rằng',
+  'rồi',
+  'sau',
+  'sẽ',
+  'so',
+  'sự',
+  'tại',
+  'theo',
+  'thì',
+  'trong',
+  'trên',
+  'trước',
+  'từ',
+  'từng',
+  'và',
+  'vẫn',
+  'vào',
+  'vậy',
+  'vì',
+  'việc',
+  'với',
+  'vừa',
+]
diff --git a/packages/stopwords/package.json b/packages/stopwords/package.json
@@ -157,6 +157,11 @@
       "import": "./dist/uk.js",
       "require": "./dist/uk.cjs"
     },
+    "./vietnamese": {
+      "types": "./dist/vi.d.ts",
+      "import": "./dist/vi.js",
+      "require": "./dist/vi.cjs"
+    },
     "./sanskrit": {
       "types": "./dist/sk.d.ts",
       "import": "./dist/sk.js",

diff --git a/packages/stopwords/scripts/build.js b/packages/stopwords/scripts/build.js
@@ -37,6 +37,7 @@ const stemmers = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  vietnamese: 'vi',
   sanskrit: 'sk'
 }