feat(synced-lyrics): thai romanization (#3618)

Co-authored-by: Angelos Bouklis <me@arjix.dev>
Co-authored-by: JellyBrick <shlee1503@naver.com>
This commit is contained in:
hax0r31337
2025-09-05 15:30:39 +08:00
committed by GitHub
parent fd68c204f6
commit 588b84ecd0
3 changed files with 62 additions and 29 deletions

View File

@ -62,6 +62,8 @@
"neverBuiltDependencies": []
},
"dependencies": {
"@dehoist/romanize-thai": "1.0.0",
"@electron-toolkit/tsconfig": "1.0.1",
"@electron/remote": "2.1.3",
"@ffmpeg.wasm/core-mt": "0.12.0",
"@ffmpeg.wasm/main": "0.12.0",

14
pnpm-lock.yaml generated
View File

@ -33,6 +33,12 @@ importers:
.:
dependencies:
'@dehoist/romanize-thai':
specifier: 1.0.0
version: 1.0.0
'@electron-toolkit/tsconfig':
specifier: 1.0.1
version: 1.0.1(@types/node@24.3.0)
'@electron/remote':
specifier: 2.1.3
version: 2.1.3(electron@38.0.0)
@ -241,9 +247,6 @@ importers:
specifier: 4.1.5
version: 4.1.5
devDependencies:
'@electron-toolkit/tsconfig':
specifier: 1.0.1
version: 1.0.1(@types/node@24.3.0)
'@eslint/js':
specifier: 9.34.0
version: 9.34.0
@ -473,6 +476,9 @@ packages:
'@bufbuild/protobuf@2.6.3':
resolution: {integrity: sha512-w/gJKME9mYN7ZoUAmSMAWXk4hkVpxRKvEJCb3dV5g9wwWdxTJJ0ayOJAVcNxtdqaxDyFuC0uz4RSGVacJ030PQ==}
'@dehoist/romanize-thai@1.0.0':
resolution: {integrity: sha512-6SqD4vyZ48otnypLXMh901CeQetoP5ptYOaIr58N6zDqjjoN0bHszMb5d/6AXJJQf8kIvbmSWBeuDrbAWLajPQ==}
'@develar/schema-utils@2.6.5':
resolution: {integrity: sha512-0cp4PsWQ/9avqTVMCtZ+GirikIA36ikvjtHweU4/j8yLtgObI0+JUPhYFScgwlteveGB1rt3Cm8UhN04XayDig==}
engines: {node: '>= 8.9.0'}
@ -4909,6 +4915,8 @@ snapshots:
'@bufbuild/protobuf@2.6.3': {}
'@dehoist/romanize-thai@1.0.0': {}
'@develar/schema-utils@2.6.5':
dependencies:
ajv: 6.12.6

View File

@ -1,15 +1,11 @@
import { render } from 'solid-js/web';
import KuromojiAnalyzer from 'kuroshiro-analyzer-kuromoji';
import Kuroshiro from 'kuroshiro';
import { romanize as esHangulRomanize } from 'es-hangul';
import hanja from 'hanja';
import pinyin from 'tiny-pinyin';
import * as pinyin from 'tiny-pinyin';
import { romanize as romanizeThaiFrag } from '@dehoist/romanize-thai';
import { lazy } from 'lazy-var';
import { detect } from 'tinyld';
import { waitForElement } from '@/utils/wait-for-element';
@ -155,26 +151,9 @@ const hasKorean = (lines: string[]) =>
/**
 * Reports whether any of the given lines contains at least one character in
 * the U+4E00–U+9FFF range.
 *
 * @param lines Lines of text to inspect.
 * @returns `true` as soon as one line matches, `false` otherwise (including
 *          for an empty array).
 */
const hasChinese = (lines: string[]) => {
  for (const text of lines) {
    if (/[\u4E00-\u9FFF]+/.test(text)) return true;
  }
  return false;
};
/**
 * Romanizes a single line of lyrics.
 *
 * The line is first passed to the handler registered for the language that
 * `detect` reports (or left untouched when there is none). The result is then
 * run through the script-based checks, so any Japanese, Korean or Chinese
 * characters that remain are converted as well.
 *
 * @param line The line to romanize.
 * @returns The romanized line.
 */
export const romanize = async (line: string) => {
  const detected = detect(line);

  const passthrough = (text: string) => text;
  const byLanguage: Record<
    string,
    (line: string) => Promise<string> | string
  > = {
    ja: romanizeJapanese,
    ko: romanizeHangul,
    zh: romanizeChinese,
  };

  const convert = byLanguage[detected] ?? passthrough;
  let result = await convert(line);

  // Script-based passes over whatever the language-specific handler left.
  if (hasJapanese([result])) {
    result = await romanizeJapanese(result);
  }
  if (hasKorean([result])) {
    result = romanizeHangul(result);
  }
  if (hasChinese([result])) {
    result = romanizeChinese(result);
  }

  return result;
};
// https://en.wikipedia.org/wiki/Thai_(Unicode_block)
/**
 * Reports whether any of the given lines contains at least one character in
 * the Thai Unicode block (U+0E00–U+0E7F).
 *
 * @param lines Lines of text to inspect.
 * @returns `true` if some line contains a Thai-block character, otherwise
 *          `false` (including for an empty array).
 */
const hasThai = (lines: string[]) => {
  const thaiBlock = /[\u0E00-\u0E7F]+/;
  return lines.findIndex((text) => thaiBlock.test(text)) !== -1;
};
export const romanizeJapanese = async (line: string) =>
(await kuroshiro.get()).convert(line, {
@ -190,3 +169,47 @@ export const romanizeChinese = (line: string) => {
pinyin.convertToPinyin(match, ' ', true),
);
};
/**
 * Word-granularity segmenter for Thai, or `null` when the runtime's
 * `Intl.Segmenter` does not report support for the `th` locale.
 */
const thaiSegmenter = (() => {
  const supportsThai = Intl.Segmenter.supportedLocalesOf('th').includes('th');
  if (!supportsThai) return null;
  return new Intl.Segmenter('th', { granularity: 'word' });
})();
/**
 * Romanizes a line of Thai text.
 *
 * When a Thai word segmenter is available, the line is split into segments:
 * word-like segments are romanized individually with `romanizeThaiFrag`,
 * other segments (punctuation, whitespace, ...) are kept after trimming, and
 * the pieces are joined with single spaces. Without a segmenter the whole
 * line is handed to `romanizeThaiFrag` unchanged.
 *
 * @param line The line to romanize.
 * @returns The romanized line, with no leading or trailing whitespace when
 *          the segmenter path is taken.
 */
export const romanizeThai = (line: string) => {
  if (!thaiSegmenter) return romanizeThaiFrag(line);

  const pieces: string[] = [];
  for (const { segment, isWordLike } of thaiSegmenter.segment(line)) {
    const piece = isWordLike ? romanizeThaiFrag(segment) : segment.trim();
    // Whitespace-only segments trim to ''. Joining them would put two spaces
    // between neighbouring words, so empty pieces are dropped here.
    if (piece) pieces.push(piece);
  }

  return pieces.join(' ').trim();
};
/** A romanizer for one language; may be synchronous or asynchronous. */
type LineRomanizer = (line: string) => Promise<string> | string;

/**
 * Romanizers keyed by language code. Looked up with the code that `detect`
 * returns for a line.
 */
const handlers: Record<string, LineRomanizer> = {
  ja: romanizeJapanese,
  ko: romanizeHangul,
  zh: romanizeChinese,
  th: romanizeThai,
};
/**
 * Romanizes a single line of lyrics.
 *
 * If the language reported by `detect` has an entry in `handlers`, that
 * handler's result is returned directly. Otherwise the line goes through each
 * script check in turn (Japanese, Korean, Chinese, Thai), and every script
 * that is present is romanized.
 *
 * @param line The line to romanize.
 * @returns The romanized line; unchanged if no handler or script check
 *          applies.
 */
export const romanize = async (line: string) => {
  const detectedHandler = handlers[detect(line)];
  if (detectedHandler) {
    return detectedHandler(line);
  }

  // fallback: no handler for the detected language, so go by script instead
  let result = line;
  if (hasJapanese([result])) {
    result = await romanizeJapanese(result);
  }
  if (hasKorean([result])) {
    result = romanizeHangul(result);
  }
  if (hasChinese([result])) {
    result = romanizeChinese(result);
  }
  if (hasThai([result])) {
    result = romanizeThai(result);
  }
  return result;
};