import { render } from 'solid-js/web'; import KuromojiAnalyzer from 'kuroshiro-analyzer-kuromoji'; import Kuroshiro from 'kuroshiro'; import { romanize as esHangulRomanize } from 'es-hangul'; import hanja from 'hanja'; import * as pinyin from 'tiny-pinyin'; import { romanize as romanizeThaiFrag } from '@dehoist/romanize-thai'; import { lazy } from 'lazy-var'; import { detect } from 'tinyld'; import { waitForElement } from '@/utils/wait-for-element'; import { LyricsRenderer, setIsVisible } from './renderer'; export const selectors = { head: '#tabsContent > .tab-header:nth-of-type(2)', body: { tabRenderer: '#tab-renderer[page-type="MUSIC_PAGE_TYPE_TRACK_LYRICS"]', root: 'ytmusic-description-shelf-renderer', }, }; export const tabStates: Record void> = { true: async () => { setIsVisible(true); let container = document.querySelector('#synced-lyrics-container'); if (container) return; const tabRenderer = await waitForElement( selectors.body.tabRenderer, ); container = Object.assign(document.createElement('div'), { id: 'synced-lyrics-container', }); tabRenderer.appendChild(container); render(() => , container); }, false: () => { setIsVisible(false); }, }; export const canonicalize = (text: string) => { return ( text // `hi there` => `hi there` .replaceAll(/\s+/g, ' ') // `( a )` => `(a)` .replaceAll(/([([]) ([^ ])/g, (_, symbol, a) => `${symbol}${a}`) .replaceAll(/([^ ]) ([)\]])/g, (_, a, symbol) => `${a}${symbol}`) // `can ' t` => `can't` .replaceAll( /([Ii]) (') ([^ ])|(n) (') (t)(?= |$)|(t) (') (s)|([^ ]) (') (re)|([^ ]) (') (ve)|([^ ]) (-) ([^ ])/g, (m, ...groups) => { for (let i = 0; i < groups.length; i += 3) { if (groups[i]) { return groups.slice(i, i + 3).join(''); } } return m; }, ) // `Stayin ' still` => `Stayin' still` .replaceAll(/in ' ([^ ])/g, (_, char) => `in' ${char}`) .replaceAll("in ',", "in',") .replaceAll(", ' cause", ", 'cause") // `hi , there` => `hi, there` .replaceAll(/([^ ]) ([.,!?])/g, (_, a, symbol) => `${a}${symbol}`) // `hi " there "` => `hi "there"` .replaceAll( /"([^"]+)"/g, (_, content) => `"${typeof content === 'string' ? content.trim() : content}"`, ) .trim() ); }; export const simplifyUnicode = (text?: string) => text ? text .replaceAll(/\u0020|\u00A0|[\u2000-\u200A]|\u202F|\u205F|\u3000/g, ' ') .trim() : text; // Japanese Shinjitai const shinjitai = [ 20055, 20081, 20120, 20124, 20175, 26469, 20341, 20206, 20253, 20605, 20385, 20537, 20816, 20001, 20869, 23500, 28092, 20956, 21104, 21091, 21092, 21172, 21234, 21169, 21223, 21306, 24059, 21363, 21442, 21782, 21336, 22107, 21427, 22065, 22287, 22269, 22258, 20870, 22259, 22243, 37326, 23597, 22679, 22549, 22311, 22593, 22730, 22732, 22766, 22769, 23551, 22885, 22888, 23330, 23398, 23517, 23455, 20889, 23515, 23453, 23558, 23554, 23550, 23626, 23631, 23646, 23792, 23777, 23798, 23731, 24012, 24035, 24111, 24182, 24259, 24195, 24193, 24382, 24357, 24367, 24452, 24467, 24500, 24499, 24658, 24693, 24746, 24745, 24910, 24808, 24540, 25040, 24651, 25126, 25135, 25144, 25147, 25173, 25244, 25309, 25375, 25407, 25522, 25531, 25594, 25436, 25246, 25731, 25285, 25312, 25369, 25313, 25666, 25785, 21454, 21177, 21465, 21189, 25968, 26029, 26179, 26217, 26172, 26278, 26241, 26365, 20250, 26465, 26719, 26628, 27097, 27010, 27005, 27004, 26530, 27096, 27178, 26727, 26908, 26716, 27177, 27431, 27475, 27497, 27508, 24112, 27531, 27579, 27572, 27598, 27671, 28169, 28057, 27972, 27973, 28167, 28179, 28201, 28382, 28288, 28300, 28508, 28171, 27810, 28287, 28168, 27996, 27818, 28381, 28716, 28286, 28948, 28783, 28988, 21942, 28809, 20105, 28858, 29344, 29366, 29421, 22888, 29420, 29471, 29539, 29486, 24321, 29942, 30011, 24403, 30067, 30185, 30196, 30330, 26479, 30423, 23613, 30495, 30740, 30741, 30783, 31192, 31108, 31109, 31036, 31074, 31095, 31216, 31282, 38964, 31298, 31311, 31331, 31363, 20006, 31883, 31992, 32076, 32209, 32210, 32257, 30476, 32294, 32207, 32333, 32260, 32117, 32331, 32153, 32154, 32330, 27424, 32566, 22768, 32884, 31899, 33075, 32966, 33235, 21488, 19982, 26087, 33398, 33624, 33550, 33804, 19975, 33931, 22290, 34219, 34101, 33464, 34220, 33446, 20966, 34394, 21495, 34509, 34411, 34635, 34453, 34542, 34907, 35013, 35090, 35226, 35239, 35251, 35302, 35617, 35388, 35379, 35465, 35501, 22793, 35698, 35715, 35914, 33398, 20104, 24336, 22770, 38972, 36059, 36341, 36527, 36605, 36620, 36578, 24321, 36766, 24321, 36965, 36883, 36933, 36794, 37070, 37111, 37204, 21307, 37284, 37271, 37304, 37320, 37682, 37549, 37676, 37806, 37444, 37619, 37489, 38306, 38501, 38543, 38522, 38560, 21452, 38609, 35207, 38666, 38745, 39003, 38997, 32763, 20313, 39173, 39366, 39442, 39366, 39443, 39365, 39620, 20307, 39658, 38360, 40335, 40206, 40568, 22633, 40614, 40633, 40634, 40644, 40658, 40665, 28857, 20826, 25993, 25998, 27503, 40802, 31452, 20096, ].map((codePoint) => String.fromCodePoint(codePoint)); const shinjitaiRegex = new RegExp(`[${shinjitai.join('')}]`); const kuroshiro = lazy(async () => { const _kuroshiro = new Kuroshiro(); await _kuroshiro.init( new KuromojiAnalyzer({ dictPath: 'https://cdn.jsdelivr.net/npm/kuromoji@0.1.2/dict/', }), ); return _kuroshiro; }); const hasJapanese = (lines: string[]) => lines.some( (line) => Kuroshiro.Util.hasKana(line) || shinjitaiRegex.test(line), ); // tests for Hangul characters, sufficient for our use case const hasKorean = (lines: string[]) => lines.some((line) => /[ㄱ-ㅎㅏ-ㅣ가-힣]+/.test(line)); const hasChinese = (lines: string[]) => lines.some((line) => /[\u4E00-\u9FFF]+/.test(line)); // https://en.wikipedia.org/wiki/Thai_(Unicode_block) const hasThai = (lines: string[]) => lines.some((line) => /[\u0E00-\u0E7F]+/.test(line)); export const romanizeJapanese = async (line: string) => (await kuroshiro.get()).convert(line, { to: 'romaji', mode: 'spaced', }) ?? line; export const romanizeHangul = (line: string) => esHangulRomanize(hanja.translate(line, 'SUBSTITUTION')); export const romanizeChinese = (line: string) => { return line.replaceAll(/[\u4E00-\u9FFF]+/g, (match) => pinyin.convertToPinyin(match, ' ', true), ); }; const thaiSegmenter = Intl.Segmenter.supportedLocalesOf('th').includes('th') ? new Intl.Segmenter('th', { granularity: 'word' }) : null; export const romanizeThai = (line: string) => { if (!thaiSegmenter) return romanizeThaiFrag(line); const segments = Array.from(thaiSegmenter.segment(line)); const latin = segments .map((segment) => segment.isWordLike ? romanizeThaiFrag(segment.segment) : segment.segment.trim(), ) .join(' ') .trim(); return latin; }; const handlers: Record Promise | string> = { ja: romanizeJapanese, ko: romanizeHangul, zh: romanizeChinese, th: romanizeThai, }; export const romanize = async (line: string) => { const lang = detect(line); const handler = handlers[lang]; if (handler) { return handler(line); } // fallback if (hasJapanese([line])) line = await romanizeJapanese(line); if (hasKorean([line])) line = romanizeHangul(line); if (hasChinese([line])) line = romanizeChinese(line); if (hasThai([line])) line = romanizeThai(line); return line; };