feat(synced-lyrics): romanization (#2790)

* feat(synced-lyrics): init romanization!

* remove debug logs and add TODO

* feat(synced-lyrics/romanization): Mandarin!

* feat(synced-lyrics/romanization): improve japanese detection

* feat(synced-lyrics/romanization): Korean!

* qol(synced-lyrics/romanization): canonicalize punctuation and symbols

* feat(synced-lyrics/romanization): handle japanese+korean and korean+chinese lyrics

* revert formatting on electron.vite.config.mts

* feat(synced-lyrics/romanization): romanize plain lyrics

* apply fix by @kimjammer

* fix lockfile due to rebase

* feat(synced-lyrics): improve lyric processing and formatting

* feat(synced-lyrics/romanization): add option to enable/disable romanization

* chore: move default value for --lyrics-duration to the declaration

* update lockfile

* fix: improvement

1. improved language detection logic
2. changed code to work in the renderer process

* fix: fix regression (canonicalize)

---------

Co-authored-by: JellyBrick <shlee1503@naver.com>
This commit is contained in:
Angelos Bouklis
2025-03-26 13:29:43 +02:00
committed by GitHub
parent 19fd0d61c6
commit 4b35a96778
19 changed files with 1304 additions and 239 deletions

View File

@ -1,8 +1,20 @@
import { render } from 'solid-js/web';
import KuromojiAnalyzer from 'kuroshiro-analyzer-kuromoji';
import Kuroshiro from 'kuroshiro';
import { romanize as esHangulRomanize } from 'es-hangul';
import hanja from 'hanja';
import pinyin from 'pinyin/esm/pinyin';
import { lazy } from 'lazy-var';
import { waitForElement } from '@/utils/wait-for-element';
import { LyricsRenderer, setIsVisible } from './renderer';
import type { LyricResult } from '@/plugins/synced-lyrics/types';
export const selectors = {
head: '#tabsContent > .tab-header:nth-of-type(2)',
body: {
@ -33,3 +45,148 @@ export const tabStates: Record<string, () => void> = {
setIsVisible(false);
},
};
export const canonicalize = (text: string) => {
return (
text
// `hi there` => `hi there`
.replaceAll(/\s+/g, ' ')
// `( a )` => `(a)`
.replaceAll(/([([]) ([^ ])/g, (_, symbol, a) => `${symbol}${a}`)
.replaceAll(/([^ ]) ([)\]])/g, (_, a, symbol) => `${a}${symbol}`)
// `can ' t` => `can't`
.replaceAll(
/([Ii]) (') ([^ ])|(n) (') (t)(?= |$)|(t) (') (s)|([^ ]) (') (re)|([^ ]) (') (ve)|([^ ]) (-) ([^ ])/g,
(m, ...groups) => {
for (let i = 0; i < groups.length; i += 3) {
if (groups[i]) {
return groups.slice(i, i + 3).join('');
}
}
return m;
},
)
// `Stayin ' still` => `Stayin' still`
.replaceAll(/in ' ([^ ])/g, (_, char) => `in' ${char}`)
.replaceAll("in ',", "in',")
.replaceAll(", ' cause", ", 'cause")
// `hi , there` => `hi, there`
.replaceAll(/([^ ]) ([.,!?])/g, (_, a, symbol) => `${a}${symbol}`)
// `hi " there "` => `hi "there"`
.replaceAll(
/"([^"]+)"/g,
(_, content) =>
`"${typeof content === 'string' ? content.trim() : content}"`,
)
.trim()
);
};
export const simplifyUnicode = (text?: string) =>
text
? text
.replaceAll(/\u0020|\u00A0|[\u2000-\u200A]|\u202F|\u205F|\u3000/g, ' ')
.trim()
: text;
// Japanese Shinjitai (simplified kanji forms), listed as decimal Unicode code
// points and converted to single-character strings by the trailing `.map`.
// The table is joined into a regex character class (`shinjitaiRegex`) that
// `hasJapanese` uses alongside the kana check.
// A few values appear more than once (e.g. 22888, 24321); that is harmless
// inside a character class.
// NOTE(review): some entries (e.g. 22269, 23398, 20250) look like forms that
// simplified Chinese also uses, so Chinese text may match — confirm whether
// that is acceptable for the callers.
const shinjitai = [
20055, 20081, 20120, 20124, 20175, 26469, 20341, 20206, 20253, 20605, 20385,
20537, 20816, 20001, 20869, 23500, 28092, 20956, 21104, 21091, 21092, 21172,
21234, 21169, 21223, 21306, 24059, 21363, 21442, 21782, 21336, 22107, 21427,
22065, 22287, 22269, 22258, 20870, 22259, 22243, 37326, 23597, 22679, 22549,
22311, 22593, 22730, 22732, 22766, 22769, 23551, 22885, 22888, 23330, 23398,
23517, 23455, 20889, 23515, 23453, 23558, 23554, 23550, 23626, 23631, 23646,
23792, 23777, 23798, 23731, 24012, 24035, 24111, 24182, 24259, 24195, 24193,
24382, 24357, 24367, 24452, 24467, 24500, 24499, 24658, 24693, 24746, 24745,
24910, 24808, 24540, 25040, 24651, 25126, 25135, 25144, 25147, 25173, 25244,
25309, 25375, 25407, 25522, 25531, 25594, 25436, 25246, 25731, 25285, 25312,
25369, 25313, 25666, 25785, 21454, 21177, 21465, 21189, 25968, 26029, 26179,
26217, 26172, 26278, 26241, 26365, 20250, 26465, 26719, 26628, 27097, 27010,
27005, 27004, 26530, 27096, 27178, 26727, 26908, 26716, 27177, 27431, 27475,
27497, 27508, 24112, 27531, 27579, 27572, 27598, 27671, 28169, 28057, 27972,
27973, 28167, 28179, 28201, 28382, 28288, 28300, 28508, 28171, 27810, 28287,
28168, 27996, 27818, 28381, 28716, 28286, 28948, 28783, 28988, 21942, 28809,
20105, 28858, 29344, 29366, 29421, 22888, 29420, 29471, 29539, 29486, 24321,
29942, 30011, 24403, 30067, 30185, 30196, 30330, 26479, 30423, 23613, 30495,
30740, 30741, 30783, 31192, 31108, 31109, 31036, 31074, 31095, 31216, 31282,
38964, 31298, 31311, 31331, 31363, 20006, 31883, 31992, 32076, 32209, 32210,
32257, 30476, 32294, 32207, 32333, 32260, 32117, 32331, 32153, 32154, 32330,
27424, 32566, 22768, 32884, 31899, 33075, 32966, 33235, 21488, 19982, 26087,
33398, 33624, 33550, 33804, 19975, 33931, 22290, 34219, 34101, 33464, 34220,
33446, 20966, 34394, 21495, 34509, 34411, 34635, 34453, 34542, 34907, 35013,
35090, 35226, 35239, 35251, 35302, 35617, 35388, 35379, 35465, 35501, 22793,
35698, 35715, 35914, 33398, 20104, 24336, 22770, 38972, 36059, 36341, 36527,
36605, 36620, 36578, 24321, 36766, 24321, 36965, 36883, 36933, 36794, 37070,
37111, 37204, 21307, 37284, 37271, 37304, 37320, 37682, 37549, 37676, 37806,
37444, 37619, 37489, 38306, 38501, 38543, 38522, 38560, 21452, 38609, 35207,
38666, 38745, 39003, 38997, 32763, 20313, 39173, 39366, 39442, 39366, 39443,
39365, 39620, 20307, 39658, 38360, 40335, 40206, 40568, 22633, 40614, 40633,
40634, 40644, 40658, 40665, 28857, 20826, 25993, 25998, 27503, 40802, 31452,
20096,
].map((codePoint) => String.fromCodePoint(codePoint));
// Character class that matches any one character from the `shinjitai` table.
const shinjitaiRegex = new RegExp('[' + shinjitai.join('') + ']');
// Shared Kuroshiro instance wrapped in `lazy`; consumers obtain it through
// `kuroshiro.get()`. The Kuromoji analyzer is pointed at the dictionary files
// hosted on jsDelivr.
const kuroshiro = lazy(async () => {
  const instance = new Kuroshiro();
  const analyzer = new KuromojiAnalyzer({
    dictPath: 'https://cdn.jsdelivr.net/npm/kuromoji@0.1.2/dict/',
  });

  // The instance is only handed out once `init` has resolved.
  await instance.init(analyzer);
  return instance;
});
// True when at least one line passes Kuroshiro's kana check or contains a
// character matched by `shinjitaiRegex`.
const hasJapanese = (lines: string[]) => {
  const looksJapanese = (line: string) =>
    Kuroshiro.Util.hasKana(line) || shinjitaiRegex.test(line);

  return lines.some(looksJapanese);
};
// Reports whether any line contains a Hangul character (compatibility jamo or
// precomposed syllables); this is sufficient for our use case.
const hasKorean = (lines: string[]) => {
  for (const line of lines) {
    if (/[ㄱ-ㅎㅏ-ㅣ가-힣]+/.test(line)) {
      return true;
    }
  }
  return false;
};
// Flattens a lyric result into plain text lines: the `text` of each entry in
// `lines` when that is an array, otherwise `lyrics` split on newlines.
// Returns an empty list when neither is available, so callers never
// dereference a missing `lyrics` string.
const lyricResultToLines = (lyric: LyricResult): string[] => {
  if (!lyric) return [];
  if (Array.isArray(lyric.lines)) {
    return lyric.lines.map(({ text }) => text);
  }
  return lyric.lyrics ? lyric.lyrics.split('\n') : [];
};

/**
 * Checks whether a lyric result contains Japanese text.
 *
 * @param lyric Lyric result carrying synced `lines` and/or plain `lyrics`.
 * @returns `true` if `hasJapanese` matches any extracted line; `false` when
 *   there is nothing to inspect.
 */
export const hasJapaneseInString = (lyric: LyricResult) =>
  hasJapanese(lyricResultToLines(lyric));

/**
 * Checks whether a lyric result contains Korean (Hangul) text.
 *
 * @param lyric Lyric result carrying synced `lines` and/or plain `lyrics`.
 * @returns `true` if `hasKorean` matches any extracted line; `false` when
 *   there is nothing to inspect.
 */
export const hasKoreanInString = (lyric: LyricResult) =>
  hasKorean(lyricResultToLines(lyric));
/**
 * Converts a line to romaji using the shared Kuroshiro instance, in `spaced`
 * mode.
 *
 * @param line Text to convert.
 * @returns Promise resolving to the result of Kuroshiro's `convert`.
 */
export const romanizeJapanese = async (line: string) => {
  const converter = await kuroshiro.get();
  return converter.convert(line, {
    to: 'romaji',
    mode: 'spaced',
  });
};
/**
 * Romanizes Korean text: the line is first passed through `hanja.translate`
 * in `SUBSTITUTION` mode (presumably replacing Hanja with Hangul — confirm
 * against the library), then through es-hangul's `romanize`.
 *
 * @param line Text to romanize.
 * @returns The romanized text.
 */
export const romanizeHangul = (line: string) => {
  const substituted = hanja.translate(line, 'SUBSTITUTION');
  return esHangulRomanize(substituted);
};
/**
 * Romanizes a line that may mix Japanese and Korean: the Japanese pass runs
 * first and its output is then fed through the Hangul pass.
 *
 * @param line Text to romanize.
 * @returns Promise resolving to the fully romanized text.
 */
export const romanizeJapaneseOrHangul = async (line: string) => {
  const romaji = await romanizeJapanese(line);
  return romanizeHangul(romaji);
};
/**
 * Romanizes Chinese text to space-separated pinyin.
 *
 * `pinyin` returns one array of candidate readings per segment. Because
 * `heteronym` is enabled, an ambiguous segment may carry several candidates;
 * flattening them all would print every alternative reading into the lyric
 * line, so only the first candidate of each segment is kept.
 *
 * @param line Text to romanize.
 * @returns Pinyin for the line, one reading per segment, joined by spaces.
 */
export const romanizeChinese = (line: string) =>
  pinyin(line, {
    heteronym: true,
    segment: true,
    group: true,
  })
    // `slice(0, 1)` keeps a single reading and also skips empty entries.
    .flatMap((readings) => readings.slice(0, 1))
    .join(' ');